From 56a5ab13e0a4d37d93ee6f579db2d20b3b0027cd Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Fri, 9 Jun 2023 16:44:45 +0800
Subject: [PATCH 01/12] origin code

---
 .../MANIFEST.in                               |    1 +
 .../docs/Makefile                             |   20 +
 .../docs/conf.py                              |   98 +
 .../docs/docutils.conf                        |    2 +
 .../docs/make.bat                             |   36 +
 .../fairseq/__init__.py                       |   45 +
 .../fairseq/benchmark/__init__.py             |    7 +
 .../benchmark_multihead_attention.py          |  172 ++
 .../fairseq/benchmark/dummy_dataset.py        |   36 +
 .../fairseq/benchmark/dummy_lm.py             |   83 +
 .../fairseq/benchmark/dummy_masked_lm.py      |   94 +
 .../fairseq/benchmark/dummy_model.py          |   96 +
 .../fairseq/benchmark/dummy_mt.py             |  119 ++
 .../fairseq/binarizer.py                      |  381 ++++
 .../fairseq/checkpoint_utils.py               |  918 ++++++++
 .../clib/libbase/balanced_assignment.cpp      |  109 +
 .../fairseq/clib/libbleu/libbleu.cpp          |  157 ++
 .../fairseq/clib/libbleu/module.cpp           |   33 +
 .../fairseq/clib/libnat/edit_dist.cpp         |  231 +++
 .../fairseq/config/__init__.py                |    4 +
 .../fairseq/config/config.yaml                |   19 +
 .../fairseq/config/fb_run_config/slurm.yaml   |   29 +
 .../transformer_lm_baevski_gbw.yaml           |   36 +
 .../transformer_lm_baevski_wiki103.yaml       |   36 +
 .../transformer_lm/transformer_lm_big.yaml    |   36 +
 .../transformer_lm/transformer_lm_gbw.yaml    |   36 +
 .../transformer_lm/transformer_lm_gpt.yaml    |   36 +
 .../transformer_lm_gpt2_big.yaml              |   36 +
 .../transformer_lm_gpt2_medium.yaml           |   36 +
 .../transformer_lm_gpt2_small.yaml            |   36 +
 .../transformer_lm_wiki103.yaml               |   36 +
 .../model/wav2vec/vq_wav2vec_gumbel.yaml      |    5 +
 .../config/model/wav2vec2/wav2vec2_base.yaml  |    8 +
 .../config/model/wav2vec2/wav2vec2_large.yaml |   20 +
 .../fairseq/criterions/__init__.py            |   36 +
 .../fairseq/criterions/adaptive_loss.py       |  124 ++
 .../fairseq/criterions/composite_loss.py      |  100 +
 .../fairseq/criterions/cross_entropy.py       |   91 +
 .../fairseq/criterions/ctc.py                 |  325 +++
 .../fairseq/criterions/fairseq_criterion.py   |  121 ++
 .../fairseq/criterions/fastspeech2_loss.py    |  137 ++
 .../fairseq/criterions/hubert_criterion.py    |  195 ++
 .../label_smoothed_cross_entropy.py           |  168 ++
 ...moothed_cross_entropy_latency_augmented.py |  221 ++
 ...l_smoothed_cross_entropy_with_alignment.py |  131 ++
 .../label_smoothed_cross_entropy_with_ctc.py  |   97 +
 ...label_smoothed_cross_entropy_with_rdrop.py |  177 ++
 .../fairseq/criterions/legacy_masked_lm.py    |  178 ++
 .../fairseq/criterions/masked_lm.py           |   99 +
 .../fairseq/criterions/model_criterion.py     |  177 ++
 .../fairseq/criterions/nat_loss.py            |  181 ++
 .../fairseq/criterions/sentence_prediction.py |  288 +++
 .../sentence_prediction_adapters.py           |   63 +
 .../fairseq/criterions/sentence_ranking.py    |  121 ++
 .../criterions/speech_dlm_criterion.py        |  335 +++
 .../criterions/speech_to_speech_criterion.py  |  517 +++++
 .../criterions/speech_ulm_criterion.py        |  126 ++
 .../fairseq/criterions/tacotron2_loss.py      |  227 ++
 .../fairseq/criterions/wav2vec_criterion.py   |  231 +++
 .../fairseq/data/__init__.py                  |  137 ++
 .../fairseq/data/add_class_target_dataset.py  |   79 +
 .../fairseq/data/add_target_dataset.py        |   83 +
 .../fairseq/data/append_token_dataset.py      |   41 +
 .../fairseq/data/audio/__init__.py            |   93 +
 .../fairseq/data/audio/audio_utils.py         |  389 ++++
 .../fairseq/data/audio/data_cfg.py            |  387 ++++
 .../data/audio/dataset_transforms/__init__.py |   53 +
 .../audio/dataset_transforms/concataugment.py |   61 +
 .../dataset_transforms/noisyoverlapaugment.py |  105 +
 .../data/audio/feature_transforms/__init__.py |   43 +
 .../audio/feature_transforms/delta_deltas.py  |   37 +
 .../audio/feature_transforms/global_cmvn.py   |   29 +
 .../audio/feature_transforms/specaugment.py   |  131 ++
 .../feature_transforms/utterance_cmvn.py      |   41 +
 .../data/audio/frm_text_to_speech_dataset.py  |  205 ++
 .../fairseq/data/audio/hubert_dataset.py      |  356 ++++
 .../data/audio/multi_modality_dataset.py      |  267 +++
 .../fairseq/data/audio/raw_audio_dataset.py   |  427 ++++
 .../data/audio/speech_to_speech_dataset.py    |  379 ++++
 .../data/audio/speech_to_text_dataset.py      |  733 +++++++
 .../audio/speech_to_text_joint_dataset.py     |  359 ++++
 .../data/audio/text_to_speech_dataset.py      |  250 +++
 .../audio/waveform_transforms/__init__.py     |   48 +
 .../audio/waveform_transforms/noiseaugment.py |  201 ++
 .../fairseq/data/backtranslation_dataset.py   |  165 ++
 .../fairseq/data/base_wrapper_dataset.py      |   78 +
 .../fairseq/data/bucket_pad_length_dataset.py |   78 +
 .../fairseq/data/codedataset.py               |  576 +++++
 .../fairseq/data/colorize_dataset.py          |   25 +
 .../fairseq/data/concat_dataset.py            |  124 ++
 .../fairseq/data/concat_sentences_dataset.py  |   54 +
 .../fairseq/data/data_utils.py                | 1144 ++++++++++
 .../fairseq/data/data_utils_fast.pyx          |  178 ++
 .../fairseq/data/denoising_dataset.py         |  443 ++++
 .../fairseq/data/dictionary.py                |  401 ++++
 .../fairseq/data/encoders/__init__.py         |   29 +
 .../fairseq/data/encoders/byte_bpe.py         |   48 +
 .../fairseq/data/encoders/byte_utils.py       |   51 +
 .../fairseq/data/encoders/bytes.py            |   34 +
 .../fairseq/data/encoders/characters.py       |   30 +
 .../fairseq/data/encoders/fastbpe.py          |   36 +
 .../fairseq/data/encoders/gpt2_bpe.py         |   45 +
 .../fairseq/data/encoders/gpt2_bpe_utils.py   |  140 ++
 .../fairseq/data/encoders/hf_bert_bpe.py      |   50 +
 .../fairseq/data/encoders/hf_byte_bpe.py      |   50 +
 .../fairseq/data/encoders/moses_tokenizer.py  |   49 +
 .../fairseq/data/encoders/nltk_tokenizer.py   |   24 +
 .../data/encoders/sentencepiece_bpe.py        |   65 +
 .../fairseq/data/encoders/space_tokenizer.py  |   21 +
 .../fairseq/data/encoders/subword_nmt_bpe.py  |   54 +
 .../fairseq/data/encoders/utils.py            |   30 +
 .../fairseq/data/fairseq_dataset.py           |  205 ++
 .../fairseq/data/fasta_dataset.py             |  107 +
 .../fairseq/data/huffman/__init__.py          |   21 +
 .../fairseq/data/huffman/huffman_coder.py     |  267 +++
 .../huffman/huffman_mmap_indexed_dataset.py   |  287 +++
 .../fairseq/data/id_dataset.py                |   19 +
 .../fairseq/data/indexed_dataset.py           |  592 ++++++
 .../fairseq/data/iterators.py                 |  879 ++++++++
 .../fairseq/data/language_pair_dataset.py     |  477 +++++
 .../fairseq/data/legacy/__init__.py           |   16 +
 .../fairseq/data/legacy/block_pair_dataset.py |  311 +++
 .../fairseq/data/legacy/masked_lm_dataset.py  |  303 +++
 .../data/legacy/masked_lm_dictionary.py       |   60 +
 .../fairseq/data/list_dataset.py              |   32 +
 .../fairseq/data/lm_context_window_dataset.py |   97 +
 .../fairseq/data/lru_cache_dataset.py         |   21 +
 .../fairseq/data/mask_tokens_dataset.py       |  226 ++
 .../fairseq/data/monolingual_dataset.py       |  253 +++
 .../fairseq/data/multi_corpus_dataset.py      |  285 +++
 .../data/multi_corpus_sampled_dataset.py      |  152 ++
 .../fairseq/data/multilingual/__init__.py     |    4 +
 .../multilingual/multilingual_data_manager.py | 1156 +++++++++++
 .../data/multilingual/multilingual_utils.py   |   63 +
 .../multilingual/sampled_multi_dataset.py     |  468 +++++
 .../sampled_multi_epoch_dataset.py            |  199 ++
 .../data/multilingual/sampling_method.py      |   78 +
 .../fairseq/data/nested_dictionary_dataset.py |  125 ++
 .../fairseq/data/noising.py                   |  334 +++
 .../fairseq/data/num_samples_dataset.py       |   17 +
 .../fairseq/data/numel_dataset.py             |   31 +
 .../fairseq/data/offset_tokens_dataset.py     |   15 +
 .../fairseq/data/pad_dataset.py               |   31 +
 .../fairseq/data/padding_mask_dataset.py      |   38 +
 .../fairseq/data/plasma_utils.py              |  197 ++
 .../fairseq/data/prepend_dataset.py           |   28 +
 .../fairseq/data/prepend_token_dataset.py     |   41 +
 .../fairseq/data/raw_label_dataset.py         |   23 +
 .../fairseq/data/replace_dataset.py           |   36 +
 .../fairseq/data/resampling_dataset.py        |  139 ++
 .../fairseq/data/roll_dataset.py              |   18 +
 .../fairseq/data/round_robin_zip_datasets.py  |  160 ++
 .../fairseq/data/shorten_dataset.py           |   78 +
 .../fairseq/data/sort_dataset.py              |   21 +
 .../fairseq/data/span_mask_tokens_dataset.py  |  293 +++
 .../fairseq/data/speech_dlm_dataset.py        |  307 +++
 .../fairseq/data/strip_token_dataset.py       |   20 +
 .../fairseq/data/subsample_dataset.py         |   75 +
 .../fairseq/data/text_compressor.py           |   58 +
 .../fairseq/data/token_block_dataset.py       |  206 ++
 .../fairseq/data/token_block_utils_fast.pyx   |  187 ++
 .../transform_eos_concat_langpair_dataset.py  |  139 ++
 .../fairseq/data/transform_eos_dataset.py     |  120 ++
 .../data/transform_eos_lang_pair_dataset.py   |  113 +
 .../fairseq/dataclass/__init__.py             |   13 +
 .../fairseq/dataclass/configs.py              | 1147 ++++++++++
 .../fairseq/dataclass/constants.py            |   56 +
 .../fairseq/dataclass/initialize.py           |   61 +
 .../fairseq/dataclass/utils.py                |  510 +++++
 .../fairseq/distributed/__init__.py           |   25 +
 .../distributed_timeout_wrapper.py            |   97 +
 .../fully_sharded_data_parallel.py            |  145 ++
 .../legacy_distributed_data_parallel.py       |  165 ++
 .../distributed/module_proxy_wrapper.py       |   56 +
 .../tpu_distributed_data_parallel.py          |   43 +
 .../fairseq/distributed/utils.py              |  843 ++++++++
 .../fairseq/file_chunker_utils.py             |   84 +
 .../fairseq/file_io.py                        |  196 ++
 .../fairseq/file_utils.py                     |  370 ++++
 .../fairseq/hub_utils.py                      |  326 +++
 .../fairseq/incremental_decoding_utils.py     |   51 +
 .../fairseq/iterative_refinement_generator.py |  359 ++++
 .../fairseq/logging/__init__.py               |    0
 .../fairseq/logging/meters.py                 |  351 ++++
 .../fairseq/logging/metrics.py                |  336 +++
 .../fairseq/logging/progress_bar.py           |  582 ++++++
 .../fairseq/model_parallel/__init__.py        |    6 +
 .../model_parallel/criterions/__init__.py     |   14 +
 .../vocab_parallel_cross_entropy.py           |   88 +
 .../model_parallel/megatron_trainer.py        |   75 +
 .../fairseq/model_parallel/models/__init__.py |   20 +
 .../pipeline_parallel_transformer/__init__.py |    6 +
 .../pipeline_parallel_transformer/layers.py   |  600 ++++++
 .../pipeline_parallel_transformer/model.py    |  789 +++++++
 .../model_parallel/models/transformer.py      |  121 ++
 .../model_parallel/models/transformer_lm.py   |  169 ++
 .../model_parallel/modules/__init__.py        |   17 +
 .../modules/multihead_attention.py            |  349 ++++
 .../modules/transformer_layer.py              |   78 +
 .../fairseq/models/__init__.py                |  236 +++
 .../fairseq/models/bart/__init__.py           |    7 +
 .../fairseq/models/bart/hub_interface.py      |  211 ++
 .../fairseq/models/bart/model.py              |  394 ++++
 .../fairseq/models/composite_encoder.py       |   57 +
 .../models/distributed_fairseq_model.py       |  147 ++
 .../fairseq/models/ema/__init__.py            |   20 +
 .../fairseq/models/ema/ema.py                 |  209 ++
 .../fairseq/models/fairseq_decoder.py         |  104 +
 .../fairseq/models/fairseq_encoder.py         |   92 +
 .../models/fairseq_incremental_decoder.py     |  118 ++
 .../fairseq/models/fairseq_model.py           |  579 ++++++
 .../fairseq/models/fconv.py                   |  756 +++++++
 .../fairseq/models/fconv_lm.py                |  136 ++
 .../fairseq/models/fconv_self_att.py          |  674 ++++++
 .../fairseq/models/hubert/__init__.py         |    7 +
 .../fairseq/models/hubert/hubert.py           |  570 +++++
 .../fairseq/models/hubert/hubert_asr.py       |  675 ++++++
 .../fairseq/models/huggingface/__init__.py    |   20 +
 .../fairseq/models/huggingface/hf_gpt2.py     |  168 ++
 .../fairseq/models/lightconv.py               | 1119 ++++++++++
 .../fairseq/models/lightconv_lm.py            |  306 +++
 .../fairseq/models/lstm.py                    |  755 +++++++
 .../fairseq/models/lstm_lm.py                 |  142 ++
 .../fairseq/models/masked_lm.py               |  404 ++++
 .../fairseq/models/model_utils.py             |   92 +
 .../models/multilingual_transformer.py        |  229 ++
 .../fairseq/models/nat/__init__.py            |   13 +
 .../fairseq/models/nat/cmlm_transformer.py    |  162 ++
 .../fairseq/models/nat/fairseq_nat_model.py   |  172 ++
 .../models/nat/insertion_transformer.py       |  280 +++
 ...iterative_nonautoregressive_transformer.py |  228 ++
 .../models/nat/levenshtein_transformer.py     |  510 +++++
 .../fairseq/models/nat/levenshtein_utils.py   |  293 +++
 .../fairseq/models/nat/nat_crf_transformer.py |  121 ++
 .../models/nat/nonautoregressive_ensembles.py |  254 +++
 .../nat/nonautoregressive_transformer.py      |  456 ++++
 .../fairseq/models/roberta/__init__.py        |   11 +
 .../fairseq/models/roberta/alignment_utils.py |  118 ++
 .../fairseq/models/roberta/enc_dec.py         |  192 ++
 .../fairseq/models/roberta/hub_interface.py   |  235 +++
 .../fairseq/models/roberta/model.py           |  700 +++++++
 .../fairseq/models/roberta/model_camembert.py |   50 +
 .../fairseq/models/roberta/model_gottbert.py  |   49 +
 .../fairseq/models/roberta/model_xlmr.py      |   46 +
 .../fairseq/models/speech_dlm/__init__.py     |    7 +
 .../models/speech_dlm/hub_interface.py        |  192 ++
 .../models/speech_dlm/modules/__init__.py     |    0
 .../speech_dlm/modules/speech_dlm_decoder.py  |  572 +++++
 .../modules/speech_dlm_decoder_layer.py       |  717 +++++++
 .../speech_dlm/sequence_generator/__init__.py |    6 +
 .../sequence_generator/multichannel_search.py |  430 ++++
 .../multichannel_sequence_generator.py        | 1110 ++++++++++
 .../fairseq/models/speech_dlm/speech_dlm.py   |  280 +++
 .../models/speech_to_speech/__init__.py       |    9 +
 .../speech_to_speech/modules/__init__.py      |    0
 .../speech_to_speech/modules/ctc_decoder.py   |   18 +
 .../modules/stacked_embedding.py              |   48 +
 .../modules/transformer_decoder_aug.py        |  108 +
 .../modules/transformer_encoder.py            |   85 +
 .../models/speech_to_speech/s2s_conformer.py  |  172 ++
 .../s2s_conformer_translatotron2.py           |  262 +++
 .../speech_to_speech/s2s_conformer_unity.py   |  298 +++
 .../speech_to_speech/s2s_transformer.py       |  722 +++++++
 .../fairseq/models/speech_to_text/__init__.py |   13 +
 .../fairseq/models/speech_to_text/berard.py   |  607 ++++++
 .../models/speech_to_text/convtransformer.py  |  443 ++++
 .../models/speech_to_text/hub_interface.py    |  128 ++
 .../models/speech_to_text/modules/__init__.py |    0
 .../modules/augmented_memory_attention.py     |  487 +++++
 .../speech_to_text/modules/convolution.py     |  126 ++
 .../models/speech_to_text/modules/emformer.py | 1844 +++++++++++++++++
 .../speech_to_text/multi_modality_model.py    |   49 +
 .../models/speech_to_text/s2t_conformer.py    |  234 +++
 .../models/speech_to_text/s2t_transformer.py  |  552 +++++
 .../speech_to_text/s2t_wav_transformer.py     |  504 +++++
 .../fairseq/models/speech_to_text/utils.py    |  562 +++++
 .../models/speech_to_text/xm_transformer.py   |  855 ++++++++
 .../speech_to_text/xm_transformer_unity.py    |  315 +++
 .../fairseq/models/text_to_speech/__init__.py |    9 +
 .../models/text_to_speech/codehifigan.py      |   95 +
 .../models/text_to_speech/fastspeech2.py      |  448 ++++
 .../fairseq/models/text_to_speech/hifigan.py  |  179 ++
 .../models/text_to_speech/hub_interface.py    |  188 ++
 .../models/text_to_speech/tacotron2.py        |  380 ++++
 .../models/text_to_speech/tts_transformer.py  |  454 ++++
 .../fairseq/models/text_to_speech/vocoder.py  |  305 +++
 .../fairseq/models/transformer/__init__.py    |   50 +
 .../models/transformer/transformer_base.py    |  193 ++
 .../models/transformer/transformer_config.py  |  341 +++
 .../models/transformer/transformer_decoder.py |  482 +++++
 .../transformer/transformer_decoder_aug.py    |  392 ++++
 .../models/transformer/transformer_encoder.py |  370 ++++
 .../models/transformer/transformer_legacy.py  |  277 +++
 .../fairseq/models/transformer_align.py       |   93 +
 .../models/transformer_from_pretrained_xlm.py |  152 ++
 .../fairseq/models/transformer_lm.py          |  607 ++++++
 .../fairseq/models/transformer_ulm.py         |  408 ++++
 .../fairseq/models/wav2vec/__init__.py        |    9 +
 .../fairseq/models/wav2vec/utils.py           |   21 +
 .../fairseq/models/wav2vec/wav2vec.py         |  630 ++++++
 .../fairseq/models/wav2vec/wav2vec2.py        | 1284 ++++++++++++
 .../fairseq/models/wav2vec/wav2vec2_asr.py    |  837 ++++++++
 .../fairseq/models/wav2vec/wav2vec2_laser.py  |   39 +
 .../fairseq/models/xmod/__init__.py           |    7 +
 .../fairseq/models/xmod/hub_interface.py      |   51 +
 .../fairseq/models/xmod/model.py              |  742 +++++++
 .../models/xmod/transformer_layer_xmod.py     |  179 ++
 .../fairseq/modules/__init__.py               |  106 +
 .../fairseq/modules/adaptive_input.py         |   81 +
 .../fairseq/modules/adaptive_softmax.py       |  268 +++
 .../fairseq/modules/base_layer.py             |  170 ++
 .../fairseq/modules/beamable_mm.py            |   49 +
 .../modules/character_token_embedder.py       |  214 ++
 .../fairseq/modules/checkpoint_activations.py |  242 +++
 .../fairseq/modules/conformer_layer.py        |  301 +++
 .../fairseq/modules/conv_tbc.py               |   53 +
 .../fairseq/modules/cross_entropy.py          |   59 +
 .../fairseq/modules/cuda_utils.cu             |  202 ++
 .../downsampled_multihead_attention.py        |  317 +++
 .../fairseq/modules/dynamic_convolution.py    |  526 +++++
 .../fairseq/modules/dynamic_crf_layer.py      |  189 ++
 .../modules/dynamicconv_layer/__init__.py     |    6 +
 .../dynamicconv_layer/cuda_function_gen.py    |  223 ++
 .../dynamicconv_layer/dynamicconv_cuda.cpp    |   51 +
 .../dynamicconv_layer/dynamicconv_cuda.cuh    |   50 +
 .../dynamicconv_cuda_kernel.cu                |  176 ++
 .../dynamicconv_layer/dynamicconv_layer.py    |  227 ++
 .../dynamicconv_layer/dynamiconv_cpu.cpp      |   29 +
 .../modules/dynamicconv_layer/setup.py        |   23 +
 .../fairseq/modules/ema_module.py             |  215 ++
 .../modules/espnet_multihead_attention.py     |  256 +++
 .../fairseq/modules/fairseq_dropout.py        |   51 +
 .../fairseq/modules/fp32_batch_norm.py        |   44 +
 .../fairseq/modules/fp32_group_norm.py        |   25 +
 .../fairseq/modules/fp32_instance_norm.py     |   35 +
 .../fairseq/modules/gelu.py                   |   25 +
 .../fairseq/modules/grad_multiply.py          |   18 +
 .../modules/gumbel_vector_quantizer.py        |  212 ++
 .../fairseq/modules/kmeans_attention.py       |  744 +++++++
 .../modules/kmeans_vector_quantizer.py        |  128 ++
 .../fairseq/modules/layer_drop.py             |   44 +
 .../fairseq/modules/layer_norm.py             |   48 +
 .../modules/learned_positional_embedding.py   |   61 +
 .../modules/lightconv_layer/__init__.py       |    6 +
 .../lightconv_layer/cuda_function_gen.py      |  289 +++
 .../lightconv_layer/lightconv_cuda.cpp        |   51 +
 .../lightconv_layer/lightconv_cuda.cuh        |   79 +
 .../lightconv_layer/lightconv_cuda_kernel.cu  |  400 ++++
 .../lightconv_layer/lightconv_layer.py        |  137 ++
 .../fairseq/modules/lightconv_layer/setup.py  |   23 +
 .../modules/lightweight_convolution.py        |  310 +++
 .../fairseq/modules/linearized_convolution.py |  125 ++
 .../fairseq/modules/location_attention.py     |   83 +
 .../fairseq/modules/lstm_cell_with_zoneout.py |   37 +
 .../fairseq/modules/multihead_attention.py    |  910 ++++++++
 .../fairseq/modules/positional_embedding.py   |   35 +
 .../fairseq/modules/positional_encoding.py    |  129 ++
 .../fairseq/modules/quant_noise.py            |  107 +
 .../fairseq/modules/quantization/__init__.py  |    0
 .../modules/quantization/pq/__init__.py       |    6 +
 .../fairseq/modules/quantization/pq/em.py     |  211 ++
 .../quantization/pq/modules/__init__.py       |    8 +
 .../modules/quantization/pq/modules/qconv.py  |  115 +
 .../modules/quantization/pq/modules/qemb.py   |  107 +
 .../quantization/pq/modules/qlinear.py        |   71 +
 .../fairseq/modules/quantization/pq/pq.py     |  128 ++
 .../fairseq/modules/quantization/pq/utils.py  |  376 ++++
 .../quantization/quantization_options.py      |   44 +
 .../modules/quantization/scalar/__init__.py   |    6 +
 .../quantization/scalar/modules/__init__.py   |    9 +
 .../quantization/scalar/modules/qact.py       |   88 +
 .../quantization/scalar/modules/qconv.py      |  149 ++
 .../quantization/scalar/modules/qemb.py       |  147 ++
 .../quantization/scalar/modules/qlinear.py    |  113 +
 .../modules/quantization/scalar/ops.py        |   59 +
 .../modules/quantization/scalar/utils.py      |   80 +
 .../modules/rotary_positional_embedding.py    |   51 +
 .../fairseq/modules/same_pad.py               |   33 +
 .../fairseq/modules/scalar_bias.py            |   31 +
 .../sinusoidal_positional_embedding.py        |  105 +
 .../modules/sparse_multihead_attention.py     |  140 ++
 .../sparse_transformer_sentence_encoder.py    |   96 +
 ...arse_transformer_sentence_encoder_layer.py |   51 +
 .../fairseq/modules/transformer_layer.py      |  562 +++++
 .../fairseq/modules/transformer_layer_aug.py  |  315 +++
 .../modules/transformer_sentence_encoder.py   |  291 +++
 .../transformer_sentence_encoder_layer.py     |  139 ++
 .../fairseq/modules/transpose_last.py         |   21 +
 .../fairseq/modules/unfold.py                 |   19 +
 .../fairseq/modules/vggblock.py               |  116 ++
 .../fairseq/nan_detector.py                   |  108 +
 .../fairseq/ngram_repeat_block.py             |  120 ++
 .../fairseq/optim/__init__.py                 |   48 +
 .../fairseq/optim/adadelta.py                 |   47 +
 .../fairseq/optim/adafactor.py                |  268 +++
 .../fairseq/optim/adagrad.py                  |   40 +
 .../fairseq/optim/adam.py                     |  239 +++
 .../fairseq/optim/adamax.py                   |  172 ++
 .../fairseq/optim/amp_optimizer.py            |  106 +
 .../fairseq/optim/bmuf.py                     |  200 ++
 .../fairseq/optim/composite.py                |  273 +++
 .../fairseq/optim/cpu_adam.py                 |  210 ++
 .../fairseq/optim/dynamic_loss_scaler.py      |   70 +
 .../fairseq/optim/fairseq_optimizer.py        |  187 ++
 .../fairseq/optim/fp16_optimizer.py           |  558 +++++
 .../fairseq/optim/fused_adam.py               |  389 ++++
 .../fairseq/optim/fused_lamb.py               |   51 +
 .../fairseq/optim/lr_scheduler/__init__.py    |   36 +
 .../optim/lr_scheduler/cosine_lr_scheduler.py |  146 ++
 .../lr_scheduler/fairseq_lr_scheduler.py      |   59 +
 .../optim/lr_scheduler/fixed_schedule.py      |   76 +
 .../inverse_square_root_schedule.py           |   85 +
 .../optim/lr_scheduler/manual_lr_scheduler.py |  121 ++
 .../optim/lr_scheduler/pass_through.py        |   39 +
 .../lr_scheduler/polynomial_decay_schedule.py |   89 +
 .../lr_scheduler/reduce_lr_on_plateau.py      |  143 ++
 .../optim/lr_scheduler/step_lr_scheduler.py   |   85 +
 .../lr_scheduler/tri_stage_lr_scheduler.py    |  175 ++
 .../lr_scheduler/triangular_lr_scheduler.py   |   83 +
 .../fairseq/optim/nag.py                      |  111 +
 .../fairseq/optim/sgd.py                      |   43 +
 .../fairseq/optim/shard.py                    |   58 +
 .../fairseq/options.py                        |  413 ++++
 .../fairseq/pdb.py                            |   47 +
 .../fairseq/quantization_utils.py             |  143 ++
 .../fairseq/registry.py                       |  104 +
 .../fairseq/scoring/__init__.py               |   55 +
 .../fairseq/scoring/bertscore.py              |   44 +
 .../fairseq/scoring/bleu.py                   |  168 ++
 .../fairseq/scoring/chrf.py                   |   36 +
 .../fairseq/scoring/meteor.py                 |   42 +
 .../fairseq/scoring/tokenizer.py              |   80 +
 .../fairseq/scoring/wer.py                    |   58 +
 .../fairseq/search.py                         |  892 ++++++++
 .../fairseq/sequence_generator.py             | 1020 +++++++++
 .../fairseq/sequence_scorer.py                |  153 ++
 .../fairseq/speech_generator.py               |  427 ++++
 .../fairseq/tasks/__init__.py                 |  138 ++
 .../fairseq/tasks/audio_finetuning.py         |  344 +++
 .../fairseq/tasks/audio_pretraining.py        |  207 ++
 .../fairseq/tasks/cross_lingual_lm.py         |  191 ++
 .../fairseq/tasks/denoising.py                |  296 +++
 .../fairseq/tasks/fairseq_task.py             |  708 +++++++
 .../fairseq/tasks/frm_text_to_speech.py       |   55 +
 .../fairseq/tasks/hubert_pretraining.py       |  191 ++
 .../fairseq/tasks/language_modeling.py        |  383 ++++
 .../fairseq/tasks/legacy_masked_lm.py         |  152 ++
 .../fairseq/tasks/masked_lm.py                |  327 +++
 .../fairseq/tasks/multilingual_denoising.py   |  268 +++
 .../tasks/multilingual_language_modeling.py   |  627 ++++++
 .../fairseq/tasks/multilingual_masked_lm.py   |  338 +++
 .../fairseq/tasks/multilingual_translation.py |  463 +++++
 .../fairseq/tasks/nlu_finetuning.py           |  477 +++++
 .../fairseq/tasks/online_backtranslation.py   |  683 ++++++
 .../tasks/semisupervised_translation.py       |  485 +++++
 .../fairseq/tasks/sentence_prediction.py      |  303 +++
 .../tasks/sentence_prediction_adapters.py     |   56 +
 .../fairseq/tasks/sentence_ranking.py         |  219 ++
 .../fairseq/tasks/simultaneous_translation.py |   41 +
 .../fairseq/tasks/span_masked_lm.py           |  243 +++
 .../fairseq/tasks/speech_dlm_task.py          |  561 +++++
 .../fairseq/tasks/speech_to_speech.py         |  597 ++++++
 .../fairseq/tasks/speech_to_text.py           |  350 ++++
 .../fairseq/tasks/speech_ulm_task.py          |  224 ++
 .../fairseq/tasks/text_to_speech.py           |  501 +++++
 .../fairseq/tasks/translation.py              |  498 +++++
 .../tasks/translation_from_pretrained_bart.py |  132 ++
 .../tasks/translation_from_pretrained_xlm.py  |   39 +
 .../fairseq/tasks/translation_lev.py          |  195 ++
 .../tasks/translation_multi_simple_epoch.py   |  441 ++++
 .../fairseq/token_generation_constraints.py   |  506 +++++
 .../fairseq/tokenizer.py                      |   15 +
 .../fairseq/trainer.py                        | 1622 +++++++++++++++
 .../fairseq/utils.py                          |  951 +++++++++
 .../fairseq/version.txt                       |    1 +
 .../fairseq_cli/__init__.py                   |    0
 .../fairseq_cli/eval_lm.py                    |  347 ++++
 .../fairseq_cli/generate.py                   |  417 ++++
 .../fairseq_cli/hydra_train.py                |   91 +
 .../fairseq_cli/hydra_validate.py             |  188 ++
 .../fairseq_cli/interactive.py                |  317 +++
 .../fairseq_cli/preprocess.py                 |  393 ++++
 .../fairseq_cli/score.py                      |  102 +
 .../fairseq_cli/train.py                      |  581 ++++++
 .../fairseq_cli/validate.py                   |  153 ++
 .../hubconf.py                                |   73 +
 .../pyproject.toml                            |   23 +
 .../release_utils.py                          |   72 +
 .../scripts/__init__.py                       |    0
 .../scripts/average_checkpoints.py            |  176 ++
 .../scripts/build_sym_alignment.py            |   97 +
 .../scripts/check_installation.py             |   36 +
 .../scripts/compare_namespaces.py             |   46 +
 .../scripts/compound_split_bleu.sh            |   20 +
 .../scripts/constraints/extract.py            |   90 +
 .../scripts/constraints/validate.py           |   34 +
 .../scripts/convert_dictionary.lua            |   34 +
 .../scripts/convert_model.lua                 |  108 +
 .../scripts/count_docs.py                     |   58 +
 .../scripts/read_binarized.py                 |   48 +
 .../scripts/rm_pt.py                          |  141 ++
 .../scripts/sacrebleu.sh                      |   27 +
 .../scripts/shard_docs.py                     |   54 +
 .../scripts/split_train_valid_docs.py         |   86 +
 .../scripts/spm_decode.py                     |   53 +
 .../scripts/spm_encode.py                     |  119 ++
 .../scripts/spm_train.py                      |   16 +
 .../scripts/test_fsdp.sh                      |   24 +
 .../setup.cfg                                 |    4 +
 .../setup.py                                  |  257 +++
 .../train.py                                  |   14 +
 511 files changed, 105259 insertions(+)
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/MANIFEST.in
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/config.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/fb_run_config/slurm.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_big.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_base.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_large.yaml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cuda_utils.cu
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/version.txt
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/pyproject.toml
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compound_split_bleu.sh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_dictionary.lua
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_model.lua
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/sacrebleu.sh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/test_fsdp.sh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.cfg
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/MANIFEST.in b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/MANIFEST.in
new file mode 100644
index 0000000000..4f719da85c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/MANIFEST.in
@@ -0,0 +1 @@
+include fairseq/version.txt
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
new file mode 100644
index 0000000000..c2f5b1a89c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = python -msphinx
+SPHINXPROJ    = fairseq
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
new file mode 100644
index 0000000000..0bc049f802
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# fairseq documentation build configuration file, created by
+# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import sys
+from fairseq import __version__
+
+
+# source code directory, relative to this file, for sphinx-autobuild
+sys.path.insert(0, os.path.abspath(".."))
+
+source_suffix = [".rst"]
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.napoleon",
+    "sphinxarg.ext",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The master toctree document.
+master_doc = "index"
+
+# General information about the project.
+project = "fairseq"
+copyright = "Facebook AI Research (FAIR)"
+author = "Facebook AI Research (FAIR)"
+
+github_doc_root = "https://github.com/pytorch/fairseq/tree/main/docs/"
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = __version__
+# The full version, including alpha/beta/rc tags.
+release = __version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+highlight_language = "python"
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+html_theme = "classic"
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {
+    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
+    "python": ("https://docs.python.org/", None),
+    "torch": ("https://pytorch.org/docs/master/", None),
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
new file mode 100644
index 0000000000..526acffd32
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
@@ -0,0 +1,2 @@
+[writers]
+option-limit=0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat
new file mode 100644
index 0000000000..baa9d02a79
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat
@@ -0,0 +1,36 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=python -msphinx
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+set SPHINXPROJ=fairseq
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
+	echo.then set the SPHINXBUILD environment variable to point to the full
+	echo.path of the 'sphinx-build' executable. Alternatively you may add the
+	echo.Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
new file mode 100644
index 0000000000..822ff0aa50
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
@@ -0,0 +1,45 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import os
+import sys
+
+try:
+    from .version import __version__  # noqa
+except ImportError:
+    version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
+    with open(version_txt) as f:
+        __version__ = f.read().strip()
+
+__all__ = ["pdb"]
+
+# Backwards compatibility: alias relocated modules under their old names so `from fairseq.X import Y` still works.
+from fairseq.distributed import utils as distributed_utils
+from fairseq.logging import meters, metrics, progress_bar  # noqa
+
+sys.modules["fairseq.distributed_utils"] = distributed_utils
+sys.modules["fairseq.meters"] = meters
+sys.modules["fairseq.metrics"] = metrics  # key must be the legacy module path, like its neighbours
+sys.modules["fairseq.progress_bar"] = progress_bar
+
+# initialize hydra
+from fairseq.dataclass.initialize import hydra_init
+
+hydra_init()
+
+import fairseq.criterions  # noqa
+import fairseq.distributed  # noqa
+import fairseq.models  # noqa
+import fairseq.modules  # noqa
+import fairseq.optim  # noqa
+import fairseq.optim.lr_scheduler  # noqa
+import fairseq.pdb  # noqa
+import fairseq.scoring  # noqa
+import fairseq.tasks  # noqa
+import fairseq.token_generation_constraints  # noqa
+
+import fairseq.benchmark  # noqa
+import fairseq.model_parallel  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
new file mode 100644
index 0000000000..0317d5c623
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# import models/tasks to register them
+from . import dummy_dataset, dummy_lm, dummy_masked_lm, dummy_model, dummy_mt  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
new file mode 100644
index 0000000000..a44847f250
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import random
+
+import torch
+from torch.utils import benchmark
+
+from fairseq.modules.multihead_attention import MultiheadAttention
+
+BATCH = [20, 41, 97]
+SEQ = 64
+EMB = 48
+HEADS = 4
+DROP = 0.1
+DEVICE = torch.device("cuda")
+ATTN_MASK_DTYPE = [torch.uint8, torch.bool, torch.float]
+KEY_PADDING_MASK_DTYPE = [torch.uint8, torch.bool]
+
+
+def _reset_seeds():
+    random.seed(0)  # Python's built-in RNG
+    torch.manual_seed(0)  # torch's global RNG
+
+
+def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int):
+    bits = torch.randint(0, 2, (dim0, dim1))  # random 0/1 entries
+    if to_dtype != torch.float:
+        return bits.to(dtype=to_dtype)
+    return bits.to(dtype=to_dtype).masked_fill(bits.to(dtype=torch.bool), -float("inf"))
+
+
+def benchmark_multihead_attention(
+    label="",
+    attn_dtype=torch.uint8,
+    key_padding_dtype=torch.uint8,
+    add_bias_kv=False,
+    add_zero_attn=False,
+    static_kv=False,
+    batch_size=20,
+    embedding=EMB,
+    seq_len=SEQ,
+    num_heads=HEADS,
+):
+    """Time fw and fw+bw passes of MultiheadAttention with and without an xformers config."""
+    results = []
+    # NOTE(review): the module-level DEVICE is never applied in this function
+
+    xformers_att_config = '{"name": "scaled_dot_product"}'
+
+    attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len)
+    key_padding_mask = _get_mask(
+        to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len
+    )
+
+    q = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
+    k = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
+    v = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
+
+    _reset_seeds()  # reset after the inputs above were drawn, before the modules are built
+
+    original_mha = MultiheadAttention(
+        embedding,
+        num_heads,
+        dropout=0.0,
+        xformers_att_config=None,
+        add_bias_kv=add_bias_kv,
+        add_zero_attn=add_zero_attn,
+    )
+
+    xformers_mha = MultiheadAttention(
+        embedding,
+        num_heads,
+        dropout=0.0,
+        xformers_att_config=xformers_att_config,
+        add_bias_kv=add_bias_kv,
+        add_zero_attn=add_zero_attn,
+    )
+
+    def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
+        original_mha(
+            query=q,
+            key=k,
+            value=v,
+            key_padding_mask=key_padding_mask,
+            attn_mask=attn_mask,
+            static_kv=static_kv,
+        )
+
+    def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
+        xformers_mha(
+            query=q,
+            key=k,
+            value=v,
+            key_padding_mask=key_padding_mask,
+            attn_mask=attn_mask,
+            static_kv=static_kv,
+        )
+
+    def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
+        output, _ = original_mha(
+            query=q,
+            key=k,
+            value=v,
+            key_padding_mask=key_padding_mask,
+            attn_mask=attn_mask,
+            static_kv=static_kv,
+        )
+        loss = torch.norm(output)  # scalar to backpropagate from
+        loss.backward()
+
+    def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
+        output, _ = xformers_mha(
+            query=q,
+            key=k,
+            value=v,
+            key_padding_mask=key_padding_mask,
+            attn_mask=attn_mask,
+            static_kv=static_kv,
+        )
+        loss = torch.norm(output)  # scalar to backpropagate from
+        loss.backward()
+
+    fns = [
+        original_bench_fw,
+        xformers_bench_fw,
+        original_bench_fw_bw,
+        xformers_bench_fw_bw,
+    ]
+
+    for fn in fns:
+        results.append(
+            benchmark.Timer(
+                stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)",
+                globals={
+                    "q": q,
+                    "k": k,
+                    "v": v,
+                    "key_padding_mask": key_padding_mask,
+                    "attn_mask": attn_mask,
+                    "static_kv": static_kv,
+                    "fn": fn,
+                },
+                label="multihead fw + bw",  # shared by all four timers, fw-only ones included
+                sub_label=f"{fn.__name__}",
+                description=label,
+            ).blocked_autorange(min_run_time=1)
+        )
+
+    compare = benchmark.Compare(results)
+    compare.print()
+
+
+def run_benchmarks():
+    """Benchmark every mask-dtype / add_bias_kv / add_zero_attn combination."""
+    for attn_dtype, key_padding_dtype, add_bias_kv, add_zero_attn in itertools.product(
+        ATTN_MASK_DTYPE, KEY_PADDING_MASK_DTYPE, [True, False], [True, False]
+    ):
+        label = (f"attn_dtype {attn_dtype}, key_padding_dtype {key_padding_dtype}, "
+                 f"add_bias_kv {add_bias_kv}, add_zero_attn {add_zero_attn}")
+        benchmark_multihead_attention(
+            label=label,
+            attn_dtype=attn_dtype,
+            key_padding_dtype=key_padding_dtype,
+            add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn,
+        )
+
+
+run_benchmarks()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
new file mode 100644
index 0000000000..2f051754af
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
@@ -0,0 +1,36 @@
+import numpy as np
+from fairseq.data import FairseqDataset
+
+
+class DummyDataset(FairseqDataset):
+    def __init__(self, batch, num_items, item_size):
+        super().__init__()
+        self.batch = batch  # the one pre-built batch that collater() hands back
+        self.num_items = num_items  # length reported by __len__
+        self.item_size = item_size  # size reported for every item
+
+    def __getitem__(self, index):
+        return index  # items carry no data of their own
+
+    def __len__(self):
+        return self.num_items
+
+    def collater(self, samples):
+        return self.batch  # ``samples`` is ignored
+
+    @property
+    def sizes(self):
+        return np.full(self.num_items, self.item_size)  # no throwaway Python list
+
+    def num_tokens(self, index):
+        return self.item_size
+
+    def size(self, index):
+        return self.item_size
+
+    def ordered_indices(self):
+        return np.arange(self.num_items)
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
new file mode 100644
index 0000000000..c6246a0c0e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from .dummy_dataset import DummyDataset
+from fairseq.data import Dictionary
+from fairseq.dataclass import FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+from omegaconf import II
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DummyLMConfig(FairseqDataclass):
+    dict_size: int = 49996  # number of "word{i}" symbols added to the dictionary
+    dataset_size: int = 100000  # passed to DummyDataset as num_items
+    tokens_per_sample: int = field(
+        default=512, metadata={"help": "max sequence length"}
+    )
+    add_bos_token: bool = False  # NOTE(review): not read by DummyLMTask in this file
+    batch_size: Optional[int] = II("dataset.batch_size")  # omegaconf interpolation
+    max_tokens: Optional[int] = II("dataset.max_tokens")  # used only when batch_size is None
+    max_target_positions: int = II("task.tokens_per_sample")
+
+
+@register_task("dummy_lm", dataclass=DummyLMConfig)
+class DummyLMTask(FairseqTask):
+    def __init__(self, cfg: DummyLMConfig):
+        super().__init__(cfg)
+
+        # synthetic vocabulary: one "word{i}" symbol per dictionary slot
+        vocab = self.dictionary = Dictionary()
+        for idx in range(cfg.dict_size):
+            vocab.add_symbol("word{}".format(idx))
+        vocab.pad_to_multiple_(8)  # often faster if divisible by 8
+        logger.info("dictionary: {} types".format(len(vocab)))
+
+        first_id = vocab.pad() + 1
+        tokens = torch.arange(cfg.tokens_per_sample + 1) + first_id
+        self.dummy_src = tokens[:-1]  # inputs
+        self.dummy_tgt = tokens[1:]  # targets: the inputs shifted by one
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Build the dummy dataset for ``split``.
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        cfg = self.cfg
+        bsz = cfg.batch_size
+        if bsz is None:
+            bsz = max(1, cfg.max_tokens // cfg.tokens_per_sample)
+        # every batch served by the dataset is this same dict
+        batch = {
+            "id": 1,
+            "net_input": {
+                "src_tokens": torch.stack([self.dummy_src] * bsz),
+                "src_lengths": torch.full(
+                    (bsz,), cfg.tokens_per_sample, dtype=torch.long
+                ),
+            },
+            "target": torch.stack([self.dummy_tgt] * bsz),
+            "nsentences": bsz,
+            "ntokens": bsz * cfg.tokens_per_sample,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=cfg.dataset_size, item_size=cfg.tokens_per_sample
+        )
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
new file mode 100644
index 0000000000..12b9c5d0f5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
@@ -0,0 +1,94 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from omegaconf import II
+
+from .dummy_dataset import DummyDataset
+from fairseq.data import Dictionary
+from fairseq.dataclass import FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DummyMaskedLMConfig(FairseqDataclass):
+    dict_size: int = 49996  # number of "word{i}" symbols added before "<mask>"
+    dataset_size: int = 100000  # passed to DummyDataset as num_items
+    tokens_per_sample: int = field(
+        default=512,
+        metadata={
+            "help": "max number of total tokens over all"
+            " segments per sample for BERT dataset"
+        },
+    )
+    batch_size: Optional[int] = II("dataset.batch_size")  # omegaconf interpolation
+    max_tokens: Optional[int] = II("dataset.max_tokens")  # used only when batch_size is None
+    max_target_positions: int = II("task.tokens_per_sample")
+
+
+@register_task("dummy_masked_lm", dataclass=DummyMaskedLMConfig)
+class DummyMaskedLMTask(FairseqTask):
+    def __init__(self, cfg: DummyMaskedLMConfig):
+        super().__init__(cfg)
+
+        vocab = self.dictionary = Dictionary()
+        for idx in range(cfg.dict_size):
+            vocab.add_symbol("word{}".format(idx))
+        logger.info("dictionary: {} types".format(len(vocab)))
+        # add mask token
+        self.mask_idx = vocab.add_symbol("<mask>")
+        vocab.pad_to_multiple_(8)  # often faster if divisible by 8
+
+        # NOTE(review): the dummy batch uses the literal ids 0 / 1 below rather than
+        # self.mask_idx / the dictionary's pad index -- confirm this is intended
+        masked_id, pad_id = 0, 1
+        tokens = torch.arange(cfg.tokens_per_sample) + pad_id + 1
+        positions = torch.arange(2, cfg.tokens_per_sample, 7)  # ~15%
+
+        # source: masked positions overwritten; target: pad_id except at those positions
+        self.dummy_src = tokens.clone()
+        self.dummy_src[positions] = masked_id
+        self.dummy_tgt = torch.full_like(tokens, pad_id)
+        self.dummy_tgt[positions] = tokens[positions]
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Build the dummy dataset for ``split``.
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        cfg = self.cfg
+        bsz = cfg.batch_size
+        if bsz is None:
+            bsz = max(1, cfg.max_tokens // cfg.tokens_per_sample)
+        # every batch served by the dataset is this same dict
+        batch = {
+            "id": 1,
+            "net_input": {
+                "src_tokens": torch.stack([self.dummy_src] * bsz),
+                "src_lengths": torch.full(
+                    (bsz,), cfg.tokens_per_sample, dtype=torch.long
+                ),
+            },
+            "target": torch.stack([self.dummy_tgt] * bsz),
+            "nsentences": bsz,
+            "ntokens": bsz * cfg.tokens_per_sample,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=cfg.dataset_size, item_size=cfg.tokens_per_sample
+        )
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
new file mode 100644
index 0000000000..ff26e4fe65
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.data import Dictionary
+from fairseq.models import (
+    FairseqDecoder,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+
+
+@register_model("dummy_model")
+class DummyModel(FairseqLanguageModel):
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args  # stored as-is; not read in this class
+
+    @staticmethod
+    def add_args(parser):
+        for flag, default in (("--num-layers", 24), ("--embed-dim", 1024)):
+            parser.add_argument(flag, type=int, default=default)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Size a DummyEncoder from ``args`` and the task's target dictionary."""
+        vocab_size = len(task.target_dictionary)
+        stack = DummyEncoder(
+            num_embed=vocab_size, embed_dim=args.embed_dim, num_layers=args.num_layers
+        )
+        return cls(args, stack)
+
+    def forward(self, src_tokens, masked_tokens=None, **kwargs):
+        return self.decoder(src_tokens, masked_tokens=masked_tokens)  # kwargs unused
+
+
+class DummyEncoder(FairseqDecoder):
+    def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
+        """Embedding, ``num_layers`` pairs of residual sub-layers, and an output projection."""
+        super().__init__(Dictionary())
+        self.embed = nn.Embedding(
+            num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
+        )
+
+        def attention_stub():
+            # the projections of an attention sub-layer, without the attention itself
+            return nn.Sequential(
+                nn.LayerNorm(embed_dim),
+                nn.Linear(embed_dim, 3 * embed_dim),  # q, k, v input projection
+                nn.Linear(3 * embed_dim, embed_dim),  # skip self-attention
+                nn.Linear(embed_dim, embed_dim),  # output projection
+                nn.Dropout(),
+            )
+
+        def feed_forward():
+            return nn.Sequential(
+                nn.LayerNorm(embed_dim),
+                nn.Linear(embed_dim, 4 * embed_dim),  # FFN
+                nn.ReLU(),
+                nn.Linear(4 * embed_dim, embed_dim),  # FFN
+                nn.Dropout(0.1),
+            )
+
+        # all a-layers are built first, then all b-layers, then the projection
+        self.layers_a = nn.ModuleList([attention_stub() for _ in range(num_layers)])
+        self.layers_b = nn.ModuleList([feed_forward() for _ in range(num_layers)])
+        self.out_proj = nn.Linear(embed_dim, num_embed)
+
+    def forward(self, tokens, masked_tokens=None):
+        hidden = self.embed(tokens)
+        for block_a, block_b in zip(self.layers_a, self.layers_b):
+            hidden = hidden + block_a(hidden)  # residual connection
+            hidden = hidden + block_b(hidden)  # residual connection
+        logits = self.out_proj(hidden)
+        if masked_tokens is None:
+            return (logits,)
+        return (logits[masked_tokens],)
+
+    def max_positions(self):
+        return 1024
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Return the (log-)softmax over the last dim of the first net output."""
+        scores = net_output[0].float()
+        # pick log-space or plain probabilities
+        normalize = F.log_softmax if log_probs else F.softmax
+        return normalize(scores, dim=-1)
+
+
+@register_model_architecture("dummy_model", "dummy_model")
+def base_architecture(args):
+    """Intentionally a no-op: this architecture sets nothing on ``args``."""
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
new file mode 100644
index 0000000000..28d78cffdb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+
+from fairseq.data import Dictionary, FairseqDataset
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("dummy_mt")
+class DummyMTTask(LegacyFairseqTask):
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        int_flags = (("--dict-size", 49996), ("--dataset-size", 100000),
+                     ("--src-len", 30), ("--tgt-len", 30))
+        for flag, default in int_flags:
+            parser.add_argument(flag, default=default, type=int)
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+        dictionary.pad_to_multiple_(8)  # often faster if divisible by 8
+
+        first_id = dictionary.pad() + 1  # ids start right after the pad index
+        self.dummy_src = torch.arange(args.src_len + 1) + first_id
+        self.dummy_tgt = torch.arange(args.tgt_len + 1) + first_id
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task."""
+        vocab = Dictionary()
+        for idx in range(args.dict_size):
+            vocab.add_symbol("word{}".format(idx))
+        logger.info("dictionary: {} types".format(len(vocab)))
+
+        position_slack = vocab.pad() + 2
+        args.max_source_positions = args.src_len + position_slack
+        args.max_target_positions = args.tgt_len + position_slack
+        return cls(args, vocab)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Build the dummy dataset for ``split``.
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        args = self.args
+        item_size = max(args.src_len, args.tgt_len)
+        bsz = args.batch_size
+        if bsz is None:
+            bsz = max(1, args.max_tokens // item_size)
+        target = torch.stack([self.dummy_tgt] * bsz)
+        # every batch served by the dataset is this same dict
+        batch = {
+            "id": 1,
+            "net_input": {
+                "src_tokens": torch.stack([self.dummy_src] * bsz),
+                "src_lengths": torch.full(
+                    (bsz,), args.src_len, dtype=torch.long
+                ),
+                "prev_output_tokens": target.clone(),
+            },
+            "target": target,
+            "nsentences": bsz,
+            "ntokens": bsz * args.tgt_len,
+        }
+        self.datasets[split] = DummyDataset(
+            batch, num_items=args.dataset_size, item_size=item_size
+        )
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+
+class DummyDataset(FairseqDataset):
+    def __init__(self, batch, num_items, item_size):
+        super().__init__()
+        self.batch = batch  # the one pre-built batch that collater() hands back
+        self.num_items = num_items  # length reported by __len__
+        self.item_size = item_size  # size reported for every item
+
+    def __getitem__(self, index):
+        return index  # items carry no data of their own
+
+    def __len__(self):
+        return self.num_items
+
+    def collater(self, samples):
+        return self.batch  # ``samples`` is ignored
+
+    @property
+    def sizes(self):
+        return np.full(self.num_items, self.item_size)  # no throwaway Python list
+
+    def num_tokens(self, index):
+        return self.item_size
+
+    def size(self, index):
+        return self.item_size
+
+    def ordered_indices(self):
+        return np.arange(self.num_items)
+
+    @property
+    def supports_prefetch(self):
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
new file mode 100644
index 0000000000..6f03d7a2cb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
@@ -0,0 +1,381 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import typing as tp
+from abc import ABC, abstractmethod
+from collections import Counter
+from dataclasses import dataclass
+from multiprocessing import Pool
+
+import torch
+
+from fairseq.data import Dictionary, indexed_dataset
+from fairseq.file_chunker_utils import Chunker, find_offsets
+from fairseq.file_io import PathManager
+from fairseq.tokenizer import tokenize_line
+
+logger = logging.getLogger("binarizer")
+
+
+@dataclass
+class BinarizeSummary:
+    """
+    Keep track of what's going on in the binarizer
+    """
+
+    num_seq: int = 0  # lines binarized
+    replaced: tp.Optional[Counter] = None  # word -> times replaced by unk; None if untracked
+    num_tok: int = 0  # tokens produced
+
+    @property
+    def num_replaced(self) -> int:
+        """Total number of replaced tokens (0 when replacements are not tracked)."""
+        return 0 if self.replaced is None else sum(self.replaced.values())
+
+    @property
+    def replaced_percent(self) -> float:
+        """Replaced tokens as a percentage of all tokens (0.0 when there are none)."""
+        # guard: a summary with zero tokens used to raise ZeroDivisionError here
+        return 100 * self.num_replaced / self.num_tok if self.num_tok else 0.0
+
+    def __str__(self) -> str:
+        base = f"{self.num_seq} sents, {self.num_tok} tokens"
+        # the replaced share is only shown once replacements are being tracked
+        if self.replaced is None:
+            return base
+
+        return f"{base}, {self.replaced_percent:.3}% replaced"
+
+    def merge(self, other: "BinarizeSummary"):
+        """Add the counts of ``other`` to this summary in place."""
+        if other.replaced is not None:
+            if self.replaced is None:
+                # copy: aliasing other's Counter would let later merges mutate it
+                self.replaced = Counter(other.replaced)
+            else:
+                self.replaced += other.replaced
+        self.num_seq += other.num_seq
+        self.num_tok += other.num_tok
+
+
+class Binarizer(ABC):
+    """
+    Abstract interface: a binarizer describes how to take a string and build a tensor out of it
+    """
+
+    @abstractmethod
+    def binarize_line(
+        self,
+        line: str,  # one line of input text
+        summary: BinarizeSummary,  # running counters; the binarizers in this file update it in place
+    ) -> torch.IntTensor:
+        ...
+
+
+def _worker_prefix(output_prefix: str, worker_id: int):
+    return "{}.pt{}".format(output_prefix, worker_id)
+
+
+class FileBinarizer:
+    """
+    A file binarizer can take a file, tokenize it, and binarize each line to a tensor
+    """
+
+    @classmethod
+    def multiprocess_dataset(
+        cls,
+        input_file: str,
+        dataset_impl: str,
+        binarizer: Binarizer,
+        output_prefix: str,
+        vocab_size=None,
+        num_workers=1,
+    ) -> BinarizeSummary:
+        final_summary = BinarizeSummary()
+
+        offsets = find_offsets(input_file, num_workers)
+        # find_offsets returns a list of positions [pos1, pos2, pos3, pos4] but we want pairs:
+        # [(pos1, pos2), (pos2, pos3), (pos3, pos4)] to process the chunks with start/end info,
+        # so we zip the list with itself shifted by one to get all the pairs.
+        (first_chunk, *more_chunks) = zip(offsets, offsets[1:])
+        pool = None
+        if num_workers > 1:
+            pool = Pool(processes=num_workers - 1)  # the first chunk is handled in this process
+            worker_results = [
+                pool.apply_async(
+                    cls._binarize_chunk_and_finalize,
+                    args=(
+                        binarizer,
+                        input_file,
+                        start_offset,
+                        end_offset,
+                        _worker_prefix(
+                            output_prefix,
+                            worker_id,
+                        ),
+                        dataset_impl,
+                    ),
+                    kwds={
+                        "vocab_size": vocab_size,
+                    }
+                    if vocab_size is not None
+                    else {},
+                )
+                for worker_id, (start_offset, end_offset) in enumerate(
+                    more_chunks, start=1
+                )
+            ]
+
+            pool.close()
+            pool.join()
+            for r in worker_results:
+                summ = r.get()  # re-raises an exception raised in the worker
+                final_summary.merge(summ)
+
+        # do not close the bin file as we need to merge the worker results in
+        final_ds, summ = cls._binarize_file_chunk(
+            binarizer,
+            input_file,
+            offset_start=first_chunk[0],
+            offset_end=first_chunk[1],
+            output_prefix=output_prefix,
+            dataset_impl=dataset_impl,
+            vocab_size=vocab_size if vocab_size is not None else None,  # NOTE(review): no-op conditional
+        )
+        final_summary.merge(summ)
+
+        if num_workers > 1:
+            for worker_id in range(1, num_workers):
+                # merge the worker outputs, in worker-id order
+                worker_output_prefix = _worker_prefix(
+                    output_prefix,
+                    worker_id,
+                )
+                final_ds.merge_file_(worker_output_prefix)
+                try:
+                    os.remove(indexed_dataset.data_file_path(worker_output_prefix))
+                    os.remove(indexed_dataset.index_file_path(worker_output_prefix))
+                except Exception as e:
+                    logger.error(
+                        f"couldn't remove {worker_output_prefix}.*", exc_info=e
+                    )
+
+        #  now we can close the file
+        idx_file = indexed_dataset.index_file_path(output_prefix)
+        final_ds.finalize(idx_file)
+        return final_summary
+
+    @staticmethod
+    def _binarize_file_chunk(
+        binarizer: Binarizer,
+        filename: str,
+        offset_start: int,
+        offset_end: int,
+        output_prefix: str,
+        dataset_impl: str,
+        vocab_size=None,
+    ) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
+        """
+        Creates a dataset builder and appends binarized items to it. This function does not
+        finalize the builder, which is useful if you want to do other things with your bin file
+        like appending/merging other files
+        """
+        bin_file = indexed_dataset.data_file_path(output_prefix)
+        ds = indexed_dataset.make_builder(
+            bin_file,
+            impl=dataset_impl,
+            vocab_size=vocab_size,
+        )
+        summary = BinarizeSummary()
+
+        with Chunker(
+            PathManager.get_local_path(filename), offset_start, offset_end
+        ) as line_iterator:
+            for line in line_iterator:
+                ds.add_item(binarizer.binarize_line(line, summary))
+
+        return ds, summary
+
+    @classmethod
+    def _binarize_chunk_and_finalize(
+        cls,
+        binarizer: Binarizer,
+        filename: str,
+        offset_start: int,
+        offset_end: int,
+        output_prefix: str,
+        dataset_impl: str,
+        vocab_size=None,
+    ):
+        """
+        Same as _binarize_file_chunk, but also finalizes the builder and returns only the summary
+        """
+        ds, summ = cls._binarize_file_chunk(
+            binarizer,
+            filename,
+            offset_start,
+            offset_end,
+            output_prefix,
+            dataset_impl,
+            vocab_size=vocab_size,
+        )
+
+        idx_file = indexed_dataset.index_file_path(output_prefix)
+        ds.finalize(idx_file)
+
+        return summ
+
+
+class VocabularyDatasetBinarizer(Binarizer):
+    """
+    Maps every token of a line to its id via the given
+    Dictionary/Vocabulary (using its encode_line function).
+    """
+
+    def __init__(
+        self,
+        dict: Dictionary,
+        tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line,
+        append_eos: bool = True,
+        reverse_order: bool = False,
+        already_numberized: bool = False,
+    ) -> None:
+        super().__init__()
+        self.dict = dict
+        self.tokenize = tokenize
+        self.append_eos = append_eos
+        self.reverse_order = reverse_order
+        self.already_numberized = already_numberized
+
+    def binarize_line(
+        self,
+        line: str,
+        summary: BinarizeSummary,
+    ):
+        if summary.replaced is None:
+            summary.replaced = Counter()
+
+        def count_unknown(word, idx):  # consumer callback handed to encode_line
+            if idx == self.dict.unk_index and word != self.dict.unk_word:
+                summary.replaced.update([word])
+
+        if not self.already_numberized:
+            ids = self.dict.encode_line(
+                line=line,
+                line_tokenizer=self.tokenize,
+                add_if_not_exist=False,
+                consumer=count_unknown,
+                append_eos=self.append_eos,
+                reverse_order=self.reverse_order,
+            )
+        else:
+            # the line already holds integer ids separated by whitespace
+            numbers = [int(tok) for tok in line.strip().split()]
+            if self.reverse_order:
+                numbers.reverse()
+            if self.append_eos:
+                numbers.append(self.dict.eos())
+            ids = torch.IntTensor(numbers)
+
+        summary.num_seq += 1
+        summary.num_tok += len(ids)
+        return ids
+
+
+class AlignmentDatasetBinarizer(Binarizer):
+    """
+    Turns each line into a tensor by running it through the supplied
+    alignment parser (see utils.parse_alignment)
+    """
+
+    def __init__(
+        self,
+        alignment_parser: tp.Callable[[str], torch.IntTensor],
+    ) -> None:
+        self.alignment_parser = alignment_parser
+        super().__init__()
+
+    def binarize_line(
+        self,
+        line: str,
+        summary: BinarizeSummary,
+    ):
+        parsed = self.alignment_parser(line)
+        summary.num_seq += 1
+        summary.num_tok += len(parsed)
+        return parsed
+
+
+class LegacyBinarizer:
+    @classmethod
+    def binarize(
+        cls,
+        filename: str,
+        dico: Dictionary,
+        consumer: tp.Callable[[torch.IntTensor], None],
+        tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line,
+        append_eos: bool = True,
+        reverse_order: bool = False,
+        offset: int = 0,
+        end: int = -1,
+        already_numberized: bool = False,
+    ) -> tp.Dict[str, int]:
+        """Binarize ``filename`` with a vocabulary binarizer, passing each tensor to ``consumer``."""
+        vocab_binarizer = VocabularyDatasetBinarizer(
+            dict=dico,
+            tokenize=tokenize,
+            append_eos=append_eos,
+            reverse_order=reverse_order,
+            already_numberized=already_numberized,
+        )
+
+        # ``offset`` / ``end`` are forwarded as offset_start / offset_end
+        stats = cls._consume_file(
+            filename, vocab_binarizer, consumer, offset_start=offset, offset_end=end
+        )
+        return stats
+
+    @classmethod
+    def binarize_alignments(
+        cls,
+        filename: str,
+        alignment_parser: tp.Callable[[str], torch.IntTensor],
+        consumer: tp.Callable[[torch.IntTensor], None],
+        offset: int = 0,
+        end: int = -1,
+    ) -> tp.Dict[str, int]:
+        """Binarize ``filename`` with an alignment binarizer, passing each tensor to ``consumer``."""
+        return cls._consume_file(
+            filename,
+            AlignmentDatasetBinarizer(alignment_parser),
+            consumer,
+            offset_start=offset,
+            offset_end=end,
+        )
+
+    @staticmethod
+    def _consume_file(
+        filename: str,
+        binarizer: Binarizer,
+        consumer: tp.Callable[[torch.IntTensor], None],
+        offset_start: int,
+        offset_end: int,
+    ) -> tp.Dict[str, int]:
+        """Binarize the lines Chunker yields for the given offsets; return the counters as a dict."""
+        stats = BinarizeSummary()
+
+        local_path = PathManager.get_local_path(filename)
+        with Chunker(local_path, offset_start, offset_end) as lines:
+            for text in lines:
+                consumer(binarizer.binarize_line(text, stats))
+
+        return dict(
+            nseq=stats.num_seq,
+            nunk=stats.num_replaced,
+            ntok=stats.num_tok,
+            replaced=stats.replaced,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
new file mode 100644
index 0000000000..fb9a6679ba
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
@@ -0,0 +1,918 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import ast
+import collections
+import contextlib
+import inspect
+import logging
+import os
+import re
+import time
+import traceback
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+from fairseq.data import data_utils
+from fairseq.dataclass.configs import CheckpointConfig
+from fairseq.dataclass.utils import (
+    convert_namespace_to_omegaconf,
+    overwrite_args_by_name,
+)
+from fairseq.distributed.fully_sharded_data_parallel import FSDP, has_FSDP
+from fairseq.file_io import PathManager
+from fairseq.models import FairseqDecoder, FairseqEncoder
+from omegaconf import DictConfig, OmegaConf, open_dict
+
+logger = logging.getLogger(__name__)
+
+
+def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
+    """Write the checkpoint files selected by *cfg* and prune stale ones.
+
+    Keeps the best validation score on ``save_checkpoint.best``. Returns what
+    ``trainer.save_checkpoint`` returned, or ``None`` if nothing was written.
+    Raises ``RuntimeError`` if copying to an additional file name fails."""
+    from fairseq import meters
+
+    # only one worker should attempt to create the required dir
+    if trainer.data_parallel_rank == 0:
+        os.makedirs(cfg.save_dir, exist_ok=True)
+
+    prev_best = getattr(save_checkpoint, "best", val_loss)
+    if val_loss is not None:
+        best_function = max if cfg.maximize_best_checkpoint_metric else min
+        save_checkpoint.best = best_function(val_loss, prev_best)
+
+    if cfg.no_save:
+        return None
+
+    trainer.consolidate_optimizer()  # TODO(SS): do we need this if no_save_optimizer_state
+    if not trainer.should_save_checkpoint_on_current_rank:
+        if trainer.always_call_state_dict_during_save_checkpoint:
+            trainer.state_dict()
+        return None
+
+    write_timer = meters.StopwatchMeter()
+    write_timer.start()
+    epoch = epoch_itr.epoch
+    end_of_epoch = epoch_itr.end_of_epoch()
+    updates = trainer.get_num_updates()
+    logger.info(f"Preparing to save checkpoint for epoch {epoch} @ {updates} updates")
+
+    def is_better(a, b):
+        return a >= b if cfg.maximize_best_checkpoint_metric else a <= b
+
+    suffix = trainer.checkpoint_suffix
+    checkpoint_conds = collections.OrderedDict()
+    checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
+        end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
+    )
+    checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
+        not end_of_epoch
+        and cfg.save_interval_updates > 0
+        and updates % cfg.save_interval_updates == 0
+    )
+    checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
+        not hasattr(save_checkpoint, "best")
+        or is_better(val_loss, save_checkpoint.best)
+    )
+    if val_loss is not None and cfg.keep_best_checkpoints > 0:
+        worst_best = getattr(save_checkpoint, "best", None)
+        chkpts = checkpoint_paths(
+            cfg.save_dir,
+            pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
+                cfg.best_checkpoint_metric, suffix
+            ),
+        )
+        if len(chkpts) > 0:
+            p = chkpts[-1] if cfg.maximize_best_checkpoint_metric else chkpts[0]
+            worst_best = float(p.rsplit("_")[-1].replace("{}.pt".format(suffix), ""))
+        # add random digits to resolve ties
+        with data_utils.numpy_seed(epoch, updates, val_loss):
+            rand_sfx = np.random.randint(0, cfg.keep_best_checkpoints)
+
+        checkpoint_conds[
+            "checkpoint.best_{}_{:.3f}{}{}.pt".format(
+                cfg.best_checkpoint_metric, val_loss, rand_sfx, suffix
+            )
+        ] = worst_best is None or is_better(val_loss, worst_best)
+    checkpoint_conds[
+        "checkpoint_last{}.pt".format(suffix)
+    ] = not cfg.no_last_checkpoints
+
+    extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
+    if hasattr(save_checkpoint, "best"):
+        extra_state.update({"best": save_checkpoint.best})
+    checkpoints = [
+        os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
+    ]
+    saved_cp = None
+    if len(checkpoints) > 0 and trainer.should_save_checkpoint_on_current_rank:
+        saved_cp = trainer.save_checkpoint(checkpoints[0], extra_state)
+        for cp in checkpoints[1:]:
+            if cfg.write_checkpoints_asynchronously:
+                # TODO[ioPath]: Need to implement a delayed asynchronous
+                # file copying/moving feature.
+                logger.warning(
+                    f"ioPath is not copying {checkpoints[0]} to {cp} "
+                    "since async write mode is on."
+                )
+            else:
+                # explicit check: an ``assert`` (and the copy in it) vanishes under -O
+                if not PathManager.copy(checkpoints[0], cp, overwrite=True):
+                    raise RuntimeError(f"Failed to copy {checkpoints[0]} to {cp}")
+        write_timer.stop()
+        logger.info(
+            "Saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format(
+                checkpoints[0], epoch, updates, val_loss, write_timer.sum
+            )
+        )
+
+    if (
+        not end_of_epoch
+        and cfg.keep_interval_updates > 0
+        and trainer.should_save_checkpoint_on_current_rank
+    ):
+        # remove old checkpoints; checkpoints are sorted in descending order
+        if cfg.keep_interval_updates_pattern == -1:
+            checkpoints = checkpoint_paths(
+                cfg.save_dir, pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix)
+            )
+        else:
+            checkpoints = checkpoint_paths(
+                cfg.save_dir,
+                pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix),
+                keep_match=True,
+            )
+            checkpoints = [
+                x[0]
+                for x in checkpoints
+                if x[1] % cfg.keep_interval_updates_pattern != 0
+            ]
+
+        for old_chk in checkpoints[cfg.keep_interval_updates :]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+            elif PathManager.exists(old_chk):
+                PathManager.rm(old_chk)
+
+    if cfg.keep_last_epochs > 0 and trainer.should_save_checkpoint_on_current_rank:
+        # remove old epoch checkpoints; checkpoints are sorted in descending order
+        checkpoints = checkpoint_paths(
+            cfg.save_dir, pattern=r"checkpoint(\d+){}\.pt".format(suffix)
+        )
+        for old_chk in checkpoints[cfg.keep_last_epochs :]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+            elif PathManager.exists(old_chk):
+                PathManager.rm(old_chk)
+
+    if cfg.keep_best_checkpoints > 0 and trainer.should_save_checkpoint_on_current_rank:
+        # only keep the best N checkpoints according to validation metric
+        checkpoints = checkpoint_paths(
+            cfg.save_dir,
+            pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
+                cfg.best_checkpoint_metric, suffix
+            ),
+        )
+        if not cfg.maximize_best_checkpoint_metric:
+            checkpoints = checkpoints[::-1]
+        for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
+            if os.path.lexists(old_chk):
+                os.remove(old_chk)
+            elif PathManager.exists(old_chk):
+                PathManager.rm(old_chk)
+
+    return saved_cp
+
+
+def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
+    """
+    Load a checkpoint and restore the training iterator.
+
+    Returns the tuple ``(extra_state, epoch_itr)``; raises ``ValueError`` for
+    conflicting or invalid fine-tuning options. *passthrough_args* will be
+    passed through to ``trainer.get_train_iterator``.
+    """
+    reset_optimizer = cfg.reset_optimizer
+    reset_lr_scheduler = cfg.reset_lr_scheduler
+    optimizer_overrides = ast.literal_eval(cfg.optimizer_overrides)  # str -> literal
+    reset_meters = cfg.reset_meters
+    reset_dataloader = cfg.reset_dataloader
+
+    if cfg.finetune_from_model is not None and (
+        reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
+    ):
+        raise ValueError(
+            "--finetune-from-model can not be set together with either --reset-optimizer"
+            " or reset_lr_scheduler or reset_meters or reset_dataloader"
+        )
+
+    suffix = trainer.checkpoint_suffix
+    if (
+        cfg.restore_file == "checkpoint_last.pt"
+    ):  # default value of restore_file is 'checkpoint_last.pt'
+        checkpoint_path = os.path.join(
+            cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
+        )
+        first_launch = not PathManager.exists(checkpoint_path)  # no last checkpoint yet
+        if first_launch and getattr(cfg, "continue_once", None) is not None:
+            checkpoint_path = cfg.continue_once  # checked before finetune_from_model
+        elif cfg.finetune_from_model is not None and first_launch:
+            # if there is no last checkpoint to restore, start the finetune from pretrained model
+            # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc.
+            if PathManager.exists(cfg.finetune_from_model):
+                checkpoint_path = cfg.finetune_from_model
+                reset_optimizer = True
+                reset_lr_scheduler = True
+                reset_meters = True
+                reset_dataloader = True
+                logger.info(
+                    f"loading pretrained model from {checkpoint_path}: "
+                    "optimizer, lr scheduler, meters, dataloader will be reset"
+                )
+            else:
+                raise ValueError(
+                    f"--finetune-from-model {cfg.finetune_from_model} does not exist"
+                )
+    elif suffix is not None:
+        # note: str.replace rewrites every ".pt" occurrence in restore_file
+        checkpoint_path = cfg.restore_file.replace(".pt", suffix + ".pt")
+    else:
+        checkpoint_path = cfg.restore_file
+    if cfg.restore_file != "checkpoint_last.pt" and cfg.finetune_from_model:
+        raise ValueError(
+            "--finetune-from-model and --restore-file (non-default value) "
+            "can not be specified together: " + str(cfg)
+        )
+
+    extra_state = trainer.load_checkpoint(
+        checkpoint_path,
+        reset_optimizer,
+        reset_lr_scheduler,
+        optimizer_overrides,
+        reset_meters=reset_meters,
+    )
+
+    if (
+        extra_state is not None
+        and "best" in extra_state
+        and not reset_optimizer
+        and not reset_meters
+    ):
+        save_checkpoint.best = extra_state["best"]  # resume best-score tracking
+
+    if extra_state is not None and not reset_dataloader:
+        # restore iterator from checkpoint
+        itr_state = extra_state["train_iterator"]
+        epoch_itr = trainer.get_train_iterator(
+            epoch=itr_state["epoch"], load_dataset=True, **passthrough_args
+        )
+        epoch_itr.load_state_dict(itr_state)
+    else:
+        # nothing to resume (or dataloader reset requested): start at epoch 1
+        epoch_itr = trainer.get_train_iterator(
+            epoch=1, load_dataset=True, **passthrough_args
+        )
+    trainer.lr_step(epoch_itr.epoch)
+
+    return extra_state, epoch_itr
+
+
+def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
+    """Loads a checkpoint to CPU (with upgrading for backward compatibility).
+
+    If doing single-GPU training or if the checkpoint is only being loaded by at
+    most one process on each node (current default behavior is for only rank 0
+    to read the checkpoint from disk), load_on_all_ranks should be False to
+    avoid errors from torch.distributed not having been initialized or
+    torch.distributed.barrier() hanging.
+
+    If all processes on each node may be loading the checkpoint
+    simultaneously, load_on_all_ranks should be set to True to avoid I/O
+    conflicts.
+
+    There's currently no support for > 1 but < all processes loading the
+    checkpoint on each node.
+
+    Returns the (upgraded) checkpoint state with *arg_overrides* applied.
+    """
+    local_path = PathManager.get_local_path(path)
+    # The locally cached file returned by get_local_path() may be stale for
+    # remote files that are periodically updated/overwritten (ex:
+    # checkpoint_last.pt) - so we remove the local copy, sync across processes
+    # (if needed), and then download a fresh copy.
+    if local_path != path and PathManager.path_requires_pathmanager(path):
+        try:
+            os.remove(local_path)
+        except FileNotFoundError:
+            # With potentially multiple processes removing the same file, the
+            # file being missing is benign (missing_ok isn't available until
+            # Python 3.8).
+            pass
+        if load_on_all_ranks:
+            torch.distributed.barrier()
+        local_path = PathManager.get_local_path(path)
+    # NOTE(security): torch.load unpickles data - only load trusted checkpoints
+    with open(local_path, "rb") as f:
+        state = torch.load(f, map_location=torch.device("cpu"))
+
+    if "args" in state and state["args"] is not None and arg_overrides is not None:
+        args = state["args"]
+        for arg_name, arg_val in arg_overrides.items():
+            setattr(args, arg_name, arg_val)
+
+    if "cfg" in state and state["cfg"] is not None:
+        # hack to be able to set Namespace in dict config. this should be removed when we update to newer
+        # omegaconf version that supports object flags, or when we migrate all existing models
+        from omegaconf import __version__ as oc_version
+        from omegaconf import _utils
+        # compare numerically: as plain strings "2.10" would sort before "2.2"
+        if tuple(int(p) for p in re.findall(r"\d+", oc_version)[:2]) < (2, 2):
+            old_primitive = _utils.is_primitive_type
+            _utils.is_primitive_type = lambda _: True
+            try:
+                state["cfg"] = OmegaConf.create(state["cfg"])
+            finally:
+                _utils.is_primitive_type = old_primitive
+            OmegaConf.set_struct(state["cfg"], True)
+        else:
+            state["cfg"] = OmegaConf.create(state["cfg"], flags={"allow_objects": True})
+        if arg_overrides is not None:
+            overwrite_args_by_name(state["cfg"], arg_overrides)
+
+    state = _upgrade_state_dict(state)
+    return state
+
+
+def load_model_ensemble(
+    filenames,
+    arg_overrides: Optional[Dict[str, Any]] = None,
+    task=None,
+    strict=True,
+    suffix="",
+    num_shards=1,
+    state=None,
+):
+    """Load several checkpoints and return the models with their config.
+
+    Thin wrapper around ``load_model_ensemble_and_task`` that drops the task.
+
+    Args:
+        filenames (List[str]): checkpoint files to load
+        arg_overrides (Dict[str,Any], optional): override model args that
+            were used during model training
+        task (fairseq.tasks.FairseqTask, optional): task to use for loading
+        strict, suffix, num_shards, state: forwarded unchanged
+
+    Returns:
+        tuple: ``(ensemble, cfg)`` as produced by the wrapped function
+    """
+    # sharded checkpoints cannot be loaded strictly
+    shard_error = "Cannot load state dict with strict=True and checkpoint shards > 1"
+    assert not strict or num_shards <= 1, shard_error
+    models, cfg, _unused_task = load_model_ensemble_and_task(
+        filenames, arg_overrides, task, strict, suffix, num_shards, state
+    )
+    return models, cfg
+
+
+def get_maybe_sharded_checkpoint_filename(
+    filename: str, suffix: str, shard_idx: int, num_shards: int
+) -> str:
+    """Pick the FSDP shard file if it exists, else the model-parallel part
+    (when ``num_shards > 1``), else the suffixed *filename* itself."""
+    suffixed = filename.replace(".pt", suffix + ".pt")
+    fsdp_candidate = f"{suffixed[:-3]}-shard{shard_idx}.pt"
+    if PathManager.exists(fsdp_candidate):
+        return fsdp_candidate
+    if num_shards > 1:
+        # model-parallel parts are named after the original, unsuffixed file
+        return f"{filename[:-3]}_part{shard_idx}.pt"
+    return suffixed
+
+
+def load_model_ensemble_and_task(
+    filenames,
+    arg_overrides: Optional[Dict[str, Any]] = None,
+    task=None,
+    strict=True,
+    suffix="",
+    num_shards=1,
+    state=None,
+):
+    """Load an ensemble of models along with their config and task.
+
+    Each file in *filenames* is read shard by shard; FSDP shards are
+    consolidated before the model is built. Returns ``(ensemble, cfg, task)``.
+    """
+    assert state is None or len(filenames) == 1
+    from fairseq import tasks
+
+    assert not (
+        strict and num_shards > 1
+    ), "Cannot load state dict with strict=True and checkpoint shards > 1"
+    ensemble = []
+    cfg = None
+    for filename in filenames:
+        orig_filename = filename
+        model_shard_state = {"shard_weights": [], "shard_metadata": []}
+        assert num_shards > 0
+        st = time.time()  # start time for the per-shard timing log below
+        for shard_idx in range(num_shards):
+            filename = get_maybe_sharded_checkpoint_filename(
+                orig_filename, suffix, shard_idx, num_shards
+            )
+            if not PathManager.exists(filename):
+                raise IOError("Model file not found: {}".format(filename))
+            if state is None:
+                state = load_checkpoint_to_cpu(filename, arg_overrides)
+            if "args" in state and state["args"] is not None:
+                cfg = convert_namespace_to_omegaconf(state["args"])
+            elif "cfg" in state and state["cfg"] is not None:
+                cfg = state["cfg"]
+            else:
+                raise RuntimeError(
+                    f"Neither args nor cfg exist in state keys = {state.keys()}"
+                )
+
+            if task is None:
+                task = tasks.setup_task(cfg.task, from_checkpoint=True)
+
+            if "task_state" in state:
+                task.load_state_dict(state["task_state"])
+            argspec = inspect.getfullargspec(task.build_model)
+
+            if "fsdp_metadata" in state and num_shards > 1:
+                model_shard_state["shard_weights"].append(state["model"])
+                model_shard_state["shard_metadata"].append(state["fsdp_metadata"])
+                # check FSDP import before the code goes too far
+                if not has_FSDP:
+                    raise ImportError(
+                        "Cannot find FullyShardedDataParallel. "
+                        "Please install fairscale with: pip install fairscale"
+                    )
+                if shard_idx == num_shards - 1:  # last shard: all weights collected
+                    consolidated_model_state = FSDP.consolidate_shard_weights(
+                        shard_weights=model_shard_state["shard_weights"],
+                        shard_metadata=model_shard_state["shard_metadata"],
+                    )
+                    if "from_checkpoint" in argspec.args:
+                        model = task.build_model(cfg.model, from_checkpoint=True)
+                    else:
+                        model = task.build_model(cfg.model)
+                    if (
+                        "optimizer_history" in state
+                        and len(state["optimizer_history"]) > 0
+                        and "num_updates" in state["optimizer_history"][-1]
+                    ):
+                        model.set_num_updates(
+                            state["optimizer_history"][-1]["num_updates"]
+                        )
+                    model.load_state_dict(
+                        consolidated_model_state, strict=strict, model_cfg=cfg.model
+                    )
+            else:
+                # model parallel checkpoint or unsharded checkpoint
+                # support old external tasks
+                if "from_checkpoint" in argspec.args:
+                    model = task.build_model(cfg.model, from_checkpoint=True)
+                else:
+                    model = task.build_model(cfg.model)
+                if (
+                    "optimizer_history" in state
+                    and len(state["optimizer_history"]) > 0
+                    and "num_updates" in state["optimizer_history"][-1]
+                ):
+                    model.set_num_updates(state["optimizer_history"][-1]["num_updates"])
+                model.load_state_dict(
+                    state["model"], strict=strict, model_cfg=cfg.model
+                )
+
+            # reset state so it gets loaded for the next model in ensemble
+            state = None
+            if shard_idx % 10 == 0 and shard_idx > 0:
+                elapsed = time.time() - st
+                logger.info(
+                    f"Loaded {shard_idx} shards in {elapsed:.2f}s, {elapsed / (shard_idx+1):.2f}s/shard"
+                )
+        # build model for ensemble
+        ensemble.append(model)
+    return ensemble, cfg, task
+
+
+def load_model_ensemble_and_task_from_hf_hub(
+    model_id,
+    cache_dir: Optional[str] = None,
+    arg_overrides: Optional[Dict[str, Any]] = None,
+    **kwargs: Any,
+):
+    """Download *model_id* from the Hugging Face Hub and load every ``*.pt``
+    file of the snapshot via ``load_model_ensemble_and_task``.
+
+    *arg_overrides* is copied (not mutated) and gets ``data`` set to the
+    snapshot directory. Raises ``ImportError`` if huggingface_hub is missing."""
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError as err:
+        raise ImportError(
+            "You need to install huggingface_hub to use `load_from_hf_hub`. "
+            "See https://pypi.org/project/huggingface-hub/ for installation."
+        ) from err
+    library_name = "fairseq"
+    cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix()
+    cache_dir = snapshot_download(
+        model_id, cache_dir=cache_dir, library_name=library_name, **kwargs
+    )
+    overrides = {**(arg_overrides or {}), "data": cache_dir}  # never mutate caller's dict
+    checkpoints = sorted(p.as_posix() for p in Path(cache_dir).glob("*.pt"))
+    return load_model_ensemble_and_task(checkpoints, arg_overrides=overrides)
+
+
+def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt", keep_match=False):
+    """List the files in `path` whose full name matches `pattern`.
+
+    Results are joined with `path` and sorted in descending order of the first
+    regex group (as float), or of listing position if the pattern has no
+    group. With `keep_match`, ``(joined_path, sort_key)`` tuples are returned.
+    """
+    matcher = re.compile(pattern)
+    found = []
+    for position, name in enumerate(PathManager.ls(path)):
+        hit = matcher.fullmatch(name)
+        if hit is None:
+            continue
+        sort_key = float(hit.group(1)) if hit.groups() else position
+        found.append((sort_key, hit.group(0)))
+    # tuples sort by key first, then by file name
+    found.sort(reverse=True)
+    if keep_match:
+        return [(os.path.join(path, fname), key) for key, fname in found]
+    return [os.path.join(path, fname) for key, fname in found]
+
+
+def torch_persistent_save(obj, filename, async_write: bool = False):
+    """Serialize *obj* to *filename*: asynchronously, atomically through a
+    ``.tmp`` file plus rename when supported, or else by a direct write."""
+    if async_write:
+        with PathManager.opena(filename, "wb") as handle:
+            _torch_persistent_save(obj, handle)
+    elif PathManager.supports_rename(filename):
+        tmp_name = filename + ".tmp"
+        with PathManager.open(tmp_name, "wb") as handle:
+            _torch_persistent_save(obj, handle)
+        PathManager.rename(tmp_name, filename)
+    else:
+        with PathManager.open(filename, "wb") as handle:
+            _torch_persistent_save(obj, handle)
+
+
+def _torch_persistent_save(obj, f):
+    """``torch.save`` *obj* to the path or open file *f*, making up to three
+    attempts 2.5 seconds apart; the last failure is logged and re-raised."""
+    if isinstance(f, str):
+        with PathManager.open(f, "wb") as h:  # recurse with the handle, not the wrapper
+            return _torch_persistent_save(obj, h)
+    for attempt in range(3):
+        try:
+            return torch.save(obj, f)
+        except Exception:
+            if attempt == 2:
+                logger.error(traceback.format_exc())
+                raise
+            time.sleep(2.5)
+
+
+def _upgrade_state_dict(state):
+    """Helper for upgrading old model checkpoints.
+
+    Mutates *state* in place and returns it."""
+    # add optimizer_history
+    if "optimizer_history" not in state:
+        state["optimizer_history"] = [
+            {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]}
+        ]
+        state["last_optimizer_state"] = state["optimizer"]
+        del state["optimizer"]
+        del state["best_loss"]
+    # move extra_state into sub-dictionary
+    if "epoch" in state and "extra_state" not in state:
+        state["extra_state"] = {
+            "epoch": state["epoch"],
+            "batch_offset": state["batch_offset"],
+            "val_loss": state["val_loss"],
+        }
+        del state["epoch"]
+        del state["batch_offset"]
+        del state["val_loss"]
+    # reduce optimizer history's memory usage (only keep the last state)
+    if "optimizer" in state["optimizer_history"][-1]:
+        state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"]
+        for optim_hist in state["optimizer_history"]:
+            del optim_hist["optimizer"]
+    # record the optimizer class name
+    if "optimizer_name" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG"
+    # move best_loss into lr_scheduler_state
+    if "lr_scheduler_state" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["lr_scheduler_state"] = {
+            "best": state["optimizer_history"][-1]["best_loss"]
+        }
+        del state["optimizer_history"][-1]["best_loss"]
+    # keep track of number of updates
+    if "num_updates" not in state["optimizer_history"][-1]:
+        state["optimizer_history"][-1]["num_updates"] = 0
+    # use stateful training data iterator
+    if "train_iterator" not in state["extra_state"]:
+        state["extra_state"]["train_iterator"] = {
+            "epoch": state["extra_state"].get("epoch", 0),
+            "iterations_in_epoch": state["extra_state"].get("batch_offset", 0),
+        }
+
+    # backward compatibility, cfg updates
+    if "args" in state and state["args"] is not None:
+        # old model checkpoints may not have separate source/target positions
+        if hasattr(state["args"], "max_positions") and not hasattr(
+            state["args"], "max_source_positions"
+        ):
+            state["args"].max_source_positions = state["args"].max_positions
+            state["args"].max_target_positions = state["args"].max_positions
+        # default to translation task
+        if not hasattr(state["args"], "task"):
+            state["args"].task = "translation"
+        # --raw-text and --lazy-load are deprecated
+        if getattr(state["args"], "raw_text", False):
+            state["args"].dataset_impl = "raw"
+        elif getattr(state["args"], "lazy_load", False):
+            state["args"].dataset_impl = "lazy"
+        # epochs start at 1
+        if state["extra_state"]["train_iterator"] is not None:
+            state["extra_state"]["train_iterator"]["epoch"] = max(
+                state["extra_state"]["train_iterator"].get("epoch", 1), 1
+            )
+        # --remove-bpe ==> --postprocess
+        if hasattr(state["args"], "remove_bpe"):
+            state["args"].post_process = state["args"].remove_bpe
+        # --min-lr ==> --stop-min-lr
+        if hasattr(state["args"], "min_lr"):
+            state["args"].stop_min_lr = state["args"].min_lr
+            del state["args"].min_lr
+        # binary_cross_entropy / kd_binary_cross_entropy => wav2vec criterion
+        if hasattr(state["args"], "criterion") and state["args"].criterion in [
+            "binary_cross_entropy",
+            "kd_binary_cross_entropy",
+        ]:
+            state["args"].criterion = "wav2vec"
+        # remove log_keys if it's None (criteria will supply a default value of [])
+        if hasattr(state["args"], "log_keys") and state["args"].log_keys is None:
+            delattr(state["args"], "log_keys")
+        # speech_pretraining => audio pretraining
+        if (
+            hasattr(state["args"], "task")
+            and state["args"].task == "speech_pretraining"
+        ):
+            state["args"].task = "audio_pretraining"
+        # audio_cpc => wav2vec
+        if hasattr(state["args"], "arch") and state["args"].arch == "audio_cpc":
+            state["args"].arch = "wav2vec"
+        # convert legacy float learning rate to List[float]
+        if hasattr(state["args"], "lr") and isinstance(state["args"].lr, float):
+            state["args"].lr = [state["args"].lr]
+        # convert task data arg to a string instead of List[string]
+        if (
+            hasattr(state["args"], "data")
+            and isinstance(state["args"].data, list)
+            and len(state["args"].data) > 0
+        ):
+            state["args"].data = state["args"].data[0]
+        # rebuild cfg from the upgraded legacy args (replaces any existing cfg)
+        state["cfg"] = convert_namespace_to_omegaconf(state["args"])
+
+    if "cfg" in state and state["cfg"] is not None:
+        cfg = state["cfg"]
+        with open_dict(cfg):
+            # any upgrades for Hydra-based configs
+            if (
+                "task" in cfg
+                and "eval_wer_config" in cfg.task
+                and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
+            ):
+                cfg.task.eval_wer_config.print_alignment = "hard"  # bool -> str option
+            if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
+                cfg.generation.print_alignment = (
+                    "hard" if cfg.generation.print_alignment else None
+                )
+            if (
+                "model" in cfg
+                and "w2v_args" in cfg.model
+                and cfg.model.w2v_args is not None
+                and (
+                    hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
+                )
+                and hasattr(cfg.model.w2v_args.task, "eval_wer_config")
+                and cfg.model.w2v_args.task.eval_wer_config is not None
+                and isinstance(
+                    cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
+                )
+            ):
+                cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"
+    return state
+
+
+def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
+    """Drop and renumber layers of *state_dict* for LayerDrop-style pruning
+    (https://arxiv.org/abs/1909.11556).
+
+    Models trained with LayerDrop tolerate having layers removed at inference
+    time. When *model_cfg* lists ``encoder_layers_to_keep`` and/or
+    ``decoder_layers_to_keep``, only those layers are copied into a new dict
+    and renumbered consecutively from zero. The ``*_layers_to_keep`` entries
+    of *model_cfg* are reset to ``None`` afterwards.
+
+    Returns the pruned dict, or *state_dict* itself when nothing is pruned.
+
+    It's called by functions that load models from checkpoints and does not
+    need to be called directly.
+    """
+    if model_cfg is None:
+        arch = None
+    elif isinstance(model_cfg, DictConfig):
+        arch = model_cfg._name
+    else:
+        arch = getattr(model_cfg, "arch", None)
+
+    if not model_cfg or arch is None or arch == "ptt_transformer":
+        # args should not be none, but don't crash if it is.
+        return state_dict
+
+    keep_by_component = {
+        "encoder": getattr(model_cfg, "encoder_layers_to_keep", None),
+        "decoder": getattr(model_cfg, "decoder_layers_to_keep", None),
+    }
+    if not any(keep_by_component.values()):
+        return state_dict
+
+    # apply pruning
+    logger.info(
+        "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop"
+    )
+
+    def create_pruning_pass(layers_to_keep, layer_name):
+        # old layer number -> new consecutive index, both kept as strings
+        kept = sorted(
+            int(number) for number in layers_to_keep.split(",")
+        )
+        renumbering = {str(old): str(new) for new, old in enumerate(kept)}
+
+        regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name))
+        return {"substitution_regex": regex, "mapping_dict": renumbering}
+
+    # encoder pass first, then decoder - the insertion order of the dict above
+    pruning_passes = [
+        create_pruning_pass(layers, component)
+        for component, layers in keep_by_component.items()
+        if layers
+    ]
+
+    pruned = {}
+    for key, value in state_dict.items():
+        numbered = re.search(r"\.layers\.(\d+)\.", key)
+        if not numbered:
+            # if layer has no number in it, it is a supporting layer, such as
+            # an embedding
+            pruned[key] = value
+            continue
+
+        # otherwise, layer should be pruned.
+        old_number = numbered.group(1)
+        # figure out which mapping dict to replace from
+        for pruning_pass in pruning_passes:
+            if old_number not in pruning_pass["mapping_dict"]:
+                continue
+            located = pruning_pass["substitution_regex"].search(key)
+            if not located:
+                continue
+            # splice the new number over the old one (regex group 1)
+            start, end = located.span(1)
+            new_number = pruning_pass["mapping_dict"][old_number]
+            pruned[key[:start] + new_number + key[end:]] = value
+
+    # Since layers are now pruned, *_layers_to_keep are no longer needed.
+    # This is more of "It would make it work fix" rather than a proper fix.
+    context = (
+        open_dict(model_cfg)
+        if isinstance(model_cfg, DictConfig)
+        else contextlib.ExitStack()
+    )
+    with context:
+        for attribute in ("encoder_layers_to_keep", "decoder_layers_to_keep"):
+            if hasattr(model_cfg, attribute):
+                setattr(model_cfg, attribute, None)
+
+    return pruned
+
+
+def load_pretrained_component_from_model(
+    component: Union[FairseqEncoder, FairseqDecoder],
+    checkpoint: str,
+    strict: bool = True,
+):
+    """
+    Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
+    provided `component` object and return it. Only keys under the exact
+    "encoder." / "decoder." prefix are used. If state_dict fails to load, the
+    architecture of `component` may not match the one in `checkpoint`.
+    """
+    if not PathManager.exists(checkpoint):
+        raise IOError("Model file not found: {}".format(checkpoint))
+    state = load_checkpoint_to_cpu(checkpoint)
+    if isinstance(component, FairseqEncoder):
+        component_type = "encoder"
+    elif isinstance(component, FairseqDecoder):
+        component_type = "decoder"
+    else:
+        raise ValueError(
+            "component to load must be either a FairseqEncoder or "
+            "FairseqDecoder. Loading other component types are not supported."
+        )
+    prefix = component_type + "."  # the "." keeps e.g. "encoder_proj.*" out
+    component_state_dict = OrderedDict()
+    for key, value in state["model"].items():
+        if key.startswith(prefix):
+            # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
+            component_state_dict[key[len(prefix) :]] = value
+    component.load_state_dict(component_state_dict, strict=strict)
+    return component
+
+
+def verify_checkpoint_directory(save_dir: str) -> None:
+    """Ensure `save_dir` exists and is writable by creating a probe file."""
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir, exist_ok=True)
+    probe_path = os.path.join(save_dir, "dummy")
+    try:
+        with open(probe_path, "w"):
+            pass
+    except OSError:
+        message = "Unable to access checkpoint save directory: {}"
+        logger.warning(message.format(save_dir))
+        raise
+    # Only reached when the probe was written; clean it up again.
+    os.remove(probe_path)
+
+
+def save_ema_as_checkpoint(src_path, dst_path):
+    # Re-save the EMA weights found in `src_path` as a checkpoint at `dst_path`.
+    torch_persistent_save(load_ema_from_checkpoint(src_path), dst_path)
+
+
+def load_ema_from_checkpoint(fpath):
+    """Read a checkpoint and expose its EMA weights as the model weights.
+
+    Args:
+      fpath: A string path of checkpoint to load from.
+
+    Returns:
+      The loaded checkpoint dict, with its 'model' entry replaced by an
+      OrderedDict mapping parameter names to clones of the tensors stored
+      under state["extra_state"]["ema"] (half tensors are cast to float).
+
+    Raises:
+      ValueError: if the EMA entry is empty or a key is repeated.
+    """
+    ema_params = collections.OrderedDict()
+
+    def _to_cpu(storage, _location):
+        return torch.serialization.default_restore_location(storage, "cpu")
+
+    with PathManager.open(fpath, "rb") as f:
+        new_state = torch.load(f, map_location=_to_cpu)
+
+        # EMA model is stored in a separate "extra state"
+        stored = new_state["extra_state"]["ema"]
+
+        for name in list(stored.keys()):
+            if name in ema_params:
+                raise ValueError(
+                    "Key {} is repeated in EMA model params.".format(name)
+                )
+            tensor = stored[name]
+            if isinstance(tensor, torch.HalfTensor):
+                tensor = tensor.float()
+            # NOTE: clone() is needed in case the tensor is a shared parameter
+            ema_params[name] = tensor.clone()
+
+        if not ema_params:
+            raise ValueError(
+                f"Input checkpoint path '{fpath}' does not contain "
+                "ema model weights, is this model trained with EMA?"
+            )
+
+    new_state["model"] = ema_params
+    return new_state
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
new file mode 100644
index 0000000000..1a5a1061f3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+C++ code for solving the linear assignment problem.
+Based on the Auction Algorithm from
+https://dspace.mit.edu/bitstream/handle/1721.1/3265/P-2108-26912652.pdf and the
+implementation from: https://github.com/bkj/auction-lap Adapted to be more
+efficient when each worker is looking for k jobs instead of 1.
+*/
+#include <torch/extension.h>
+#include <iostream>
+using namespace torch::indexing;
+torch::Tensor balanced_assignment(torch::Tensor job_and_worker_to_score) {
+  int max_iterations = 100; // rounds before the approximate fallback below
+  torch::Tensor epsilon =
+      (job_and_worker_to_score.max() - job_and_worker_to_score.min()) / 50;
+  epsilon.clamp_min_(1e-04); // keeps the bid increment strictly positive
+  torch::Tensor worker_and_job_to_score =
+      job_and_worker_to_score.detach().transpose(0, 1).contiguous();
+  int num_workers = worker_and_job_to_score.size(0);
+  int num_jobs = worker_and_job_to_score.size(1);
+  auto device = worker_and_job_to_score.device();
+  int jobs_per_worker = num_jobs / num_workers; // truncating; NOTE(review): assumes divisibility?
+  torch::Tensor value = worker_and_job_to_score.clone();
+  int counter = 0;
+  torch::Tensor max_value = worker_and_job_to_score.max();
+  // Buffers below are allocated once and reused by every auction round.
+  torch::Tensor bid_indices;
+  torch::Tensor cost = worker_and_job_to_score.new_zeros({1, num_jobs});
+  torch::Tensor bids =
+      worker_and_job_to_score.new_empty({num_workers, num_jobs});
+  torch::Tensor bid_increments =
+      worker_and_job_to_score.new_empty({num_workers, jobs_per_worker});
+  torch::Tensor top_values =
+      worker_and_job_to_score.new_empty({num_workers, jobs_per_worker + 1});
+  torch::Tensor high_bids = worker_and_job_to_score.new_empty({num_jobs});
+
+  torch::Tensor top_index = top_values.to(torch::kLong); // same shape, Long
+  torch::Tensor high_bidders = top_index.new_empty({num_jobs});
+  torch::Tensor have_bids = high_bidders.to(torch::kBool);
+  torch::Tensor jobs_indices =
+      torch::arange({num_jobs}, torch::dtype(torch::kLong).device(device));
+  torch::Tensor true_tensor =
+      torch::ones({1}, torch::dtype(torch::kBool).device(device));
+  // Auction rounds: loop until every job has received a positive bid.
+  while (true) {
+    bids.zero_();
+    torch::topk_out(top_values, top_index, value, jobs_per_worker + 1, 1);
+
+    // Each worker bids the difference in value between that job and the k+1th
+    // job
+    torch::sub_out(
+        bid_increments,
+        top_values.index({Slice(None, None), Slice(0, jobs_per_worker)}),
+        top_values.index({Slice(None, None), jobs_per_worker}).unsqueeze(1));
+
+    bid_increments.add_(epsilon);
+    bids.scatter_(
+        1,
+        top_index.index({Slice(None, None), Slice(0, jobs_per_worker)}),
+        bid_increments);
+
+    if (counter < max_iterations && counter > 0) {
+      // Put in a minimal bid to retain items from the last round if no-one else
+      // bids for them this round
+      bids.view(-1).index_put_({bid_indices}, epsilon);
+    }
+
+    // Find the highest bidding worker per job
+    torch::max_out(high_bids, high_bidders, bids, 0);
+    torch::gt_out(have_bids, high_bids, 0);
+
+    if (have_bids.all().item<bool>()) {
+      // All jobs were bid for
+      break;
+    }
+
+    // Make popular items more expensive
+    cost.add_(high_bids);
+    torch::sub_out(value, worker_and_job_to_score, cost);
+    // Flat (worker * num_jobs + job) indices of the jobs that received a bid
+    bid_indices = ((high_bidders * num_jobs) + jobs_indices).index({have_bids});
+
+    if (counter < max_iterations) {
+      // Make sure that this item will be in the winning worker's top-k next
+      // time.
+      value.view(-1).index_put_({bid_indices}, max_value);
+    } else {
+      // Suboptimal approximation that converges quickly from current solution
+      value.view(-1).index_put_(
+          {bid_indices}, worker_and_job_to_score.view(-1).index({bid_indices}));
+    }
+
+    counter += 1;
+  }
+  // Per worker, the job indices of its top jobs_per_worker picks, flattened.
+  return top_index.index({Slice(None, None), Slice(0, jobs_per_worker)})
+      .reshape(-1);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Python binding for the function above
+  m.def("balanced_assignment", &balanced_assignment, "Balanced Assignment");
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
new file mode 100644
index 0000000000..939d9e1174
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <array>
+#include <cstdio>
+#include <cstring>
+#include <map>
+
+// NOLINTNEXTLINE
+typedef struct {
+  size_t reflen; // sum of trimmed reference lengths (updated by bleu_add)
+  size_t predlen; // sum of trimmed prediction lengths (updated by bleu_add)
+  size_t match1; // matchN: matched N-grams, see bleu_addngram
+  size_t count1; // countN: N-grams seen in the predictions
+  size_t match2;
+  size_t count2;
+  size_t match3;
+  size_t count3;
+  size_t match4;
+  size_t count4;
+} bleu_stat;
+
+// Left trim: advance *sent past leading `pad` tokens and shrink *len to match.
+// Both are updated in place; the underlying buffer is not modified.
+void bleu_ltrim(size_t* len, int** sent, int pad) {
+  const int* tokens = *sent;
+  size_t skipped = 0;
+  // Count the run of pad tokens at the front of the sentence.
+  while (skipped < *len && tokens[skipped] == pad) {
+    ++skipped;
+  }
+  *sent += skipped;
+  *len -= skipped;
+}
+
+// Right trim: drop trailing `eos` / `pad` tokens by shrinking *len in place.
+// The token at index 0 is never inspected, so a non-empty sentence keeps at
+// least one token. An empty sentence is left untouched.
+void bleu_rtrim(size_t* len, int** sent, int pad, int eos) {
+  if (*len == 0) return; // guard: `*len - 1` would wrap around and read OOB
+  size_t end = *len - 1;
+  while (end > 0 && ((*sent)[end] == eos || (*sent)[end] == pad)) {
+    end--;
+  }
+  *len = end + 1;
+}
+
+// left and right trim: strip leading pad, then trailing eos/pad, in place
+void bleu_trim(size_t* len, int** sent, int pad, int eos) {
+  bleu_ltrim(len, sent, pad);
+  bleu_rtrim(len, sent, pad, eos);
+}
+
+// Hashes the raw bytes of `len` ints starting at `data` (xor, then multiply).
+size_t bleu_hash(int len, int* data) {
+  const size_t kPrime = 0x100000001b3;
+  size_t h = 14695981039346656037ul;
+  const char* bytes = (const char*)data;
+  const size_t nbytes = sizeof(int) * len;
+  // NOTE: plain `char` may be signed; each byte is converted as-is before XOR.
+  for (size_t i = 0; i < nbytes; i++) {
+    h ^= bytes[i];
+    h *= kPrime;
+  }
+  return h;
+}
+
+// Accumulate clipped n-gram statistics of order `n` for one sentence pair.
+// *ntotal grows by the number of n-grams in `pred`; *nmatch grows by the
+// number of `ref` n-grams that can be paired with a distinct `pred` n-gram.
+// N-grams are compared by their bleu_hash value only.
+void bleu_addngram(
+    size_t* ntotal,
+    size_t* nmatch,
+    size_t n,
+    size_t reflen,
+    int* ref,
+    size_t predlen,
+    int* pred) {
+  if (predlen < n) {
+    return;
+  }
+  const size_t pred_ngrams = predlen - n + 1;
+  (*ntotal) += pred_ngrams;
+  if (reflen < n) {
+    return;
+  }
+  const size_t ref_ngrams = reflen - n + 1;
+
+  // Multiset of prediction n-grams, keyed by hash.
+  std::map<size_t, size_t> remaining;
+  for (size_t i = 0; i < pred_ngrams; i++) {
+    remaining[bleu_hash(n, pred + i)]++;
+  }
+
+  // Each reference n-gram consumes at most one matching prediction n-gram.
+  for (size_t i = 0; i < ref_ngrams; i++) {
+    size_t& left = remaining[bleu_hash(n, ref + i)];
+    if (left > 0) {
+      (*nmatch)++;
+      left -= 1;
+    }
+  }
+}
+
+extern "C" {
+
+#ifdef _WIN64
+__declspec(dllexport)
+#endif
+    void bleu_zero_init(bleu_stat* stat) {
+  std::memset(stat, 0, sizeof(bleu_stat)); // every length and counter becomes 0
+}
+
+#ifdef _WIN64
+__declspec(dllexport)
+#endif
+    void bleu_one_init(bleu_stat* stat) {
+  bleu_zero_init(stat); // lengths start at 0
+  stat->count1 = 0; // unigram counters stay at 0 ...
+  stat->count2 = 1; // ... while the 2-, 3- and 4-gram counters start at 1
+  stat->count3 = 1;
+  stat->count4 = 1;
+  stat->match1 = 0;
+  stat->match2 = 1;
+  stat->match3 = 1;
+  stat->match4 = 1;
+}
+
+#ifdef _WIN64
+__declspec(dllexport)
+#endif
+    void bleu_add(
+        bleu_stat* stat,
+        size_t reflen,
+        int* ref,
+        size_t predlen,
+        int* pred,
+        int pad,
+        int eos) {
+  // Trim both sentences, then add their lengths and 1..4-gram counts to stat.
+  bleu_trim(&reflen, &ref, pad, eos);
+  bleu_trim(&predlen, &pred, pad, eos);
+  stat->reflen += reflen;
+  stat->predlen += predlen;
+
+  bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred);
+  bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred);
+}
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
new file mode 100644
index 0000000000..35288b3177
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <Python.h>
+
+static PyMethodDef method_def[] = {{NULL, NULL, 0, NULL}}; // NOLINT
+// Definition of the "libbleu" module; method_def holds only the sentinel entry.
+static struct PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "libbleu", /* name of module */
+    // NOLINTNEXTLINE
+    NULL, /* module documentation, may be NULL */
+    -1, /* size of per-interpreter state of the module,
+           or -1 if the module keeps state in global variables. */
+    method_def}; // NOLINT
+
+#if PY_MAJOR_VERSION == 2
+PyMODINIT_FUNC init_libbleu()
+#else
+PyMODINIT_FUNC PyInit_libbleu()
+#endif
+{
+  // Module entry point: create the module described by module_def and hand
+  // it to the interpreter. A NULL result from PyModule_Create is passed
+  // through to the caller unchanged, so no separate failure branch is
+  // needed.
+  return PyModule_Create(&module_def);
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
new file mode 100644
index 0000000000..9ffb60569d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
@@ -0,0 +1,231 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <pybind11/detail/common.h>
+#include <pybind11/pybind11.h>
+#include <torch/torch.h> // @manual=//caffe2:torch_extension
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <new>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace ::std;
+
+// Builds the DP table of an edit distance in which insertion and deletion
+// cost 1 and replacing a word costs 2. d[i][j] is the distance between the
+// first i words of x and the first j words of y.
+vector<vector<uint32_t>> edit_distance2_with_dp(
+    vector<uint32_t>& x,
+    vector<uint32_t>& y) {
+  const uint32_t rows = x.size() + 1;
+  const uint32_t cols = y.size() + 1;
+  vector<vector<uint32_t>> d(rows, vector<uint32_t>(cols));
+  // Base cases: turning a prefix into the empty sequence (and vice versa).
+  for (uint32_t i = 0; i < rows; i++) d[i][0] = i;
+  for (uint32_t j = 0; j < cols; j++) d[0][j] = j;
+  for (uint32_t i = 1; i < rows; i++) {
+    for (uint32_t j = 1; j < cols; j++) {
+      const uint32_t substitute = x.at(i - 1) == y.at(j - 1) ? 0 : 2;
+      const uint32_t indel = min(d[i - 1][j], d[i][j - 1]) + 1;
+      d[i][j] = min(indel, d[i - 1][j - 1] + substitute);
+    }
+  }
+  return d;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol) {
+  vector<uint32_t> seq; // (op, word) pairs, recorded from the end backwards
+  vector<vector<uint32_t>> edit_seqs(x.size() + 2, vector<uint32_t>());
+  /*
+  edit_seqs layout:
+  cell k (0..x.size()): words of y inserted after the first k words of x
+  last cell: one flag per word of x (1 = delete, 0 = keep)
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs; // NOTE: skips the terminal_symbol fill further down
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  while ((i >= 0) && (j >= 0)) { // always true (unsigned); exits via the break
+    if ((i == 0) && (j == 0)) {
+      break;
+    }
+
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) { // replay ops front to back
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) { // consecutive inserts share one cell
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(x.size() + 1).push_back(1);
+    } else {
+      edit_seqs.at(x.size() + 1).push_back(0);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) { // mark empty cells
+    if (edit_seqs[k].size() == 0) {
+      edit_seqs[k].push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+vector<vector<uint32_t>> edit_distance2_backtracking_with_delete(
+    vector<vector<uint32_t>>& d,
+    vector<uint32_t>& x,
+    vector<uint32_t>& y,
+    uint32_t terminal_symbol,
+    uint32_t deletion_symbol) {
+  vector<uint32_t> seq; // (op, word) pairs, recorded from the end backwards
+  vector<vector<uint32_t>> edit_seqs(x.size() + 1, vector<uint32_t>());
+  /*
+  edit_seqs layout (x.size() + 1 cells, no separate delete cell):
+  inserted words of y are appended to the current cell (index s - 1);
+  each deletion appends deletion_symbol to the current cell instead
+  */
+
+  if (x.size() == 0) {
+    edit_seqs.at(0) = y;
+    return edit_seqs; // NOTE: skips the terminal_symbol fill further down
+  }
+
+  uint32_t i = d.size() - 1;
+  uint32_t j = d.at(0).size() - 1;
+
+  while ((i >= 0) && (j >= 0)) { // always true (unsigned); exits via the break
+    if ((i == 0) && (j == 0)) {
+      break;
+    }
+
+    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
+      seq.push_back(1); // insert
+      seq.push_back(y.at(j - 1));
+      j--;
+    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
+      seq.push_back(2); // delete
+      seq.push_back(x.at(i - 1));
+      i--;
+    } else {
+      seq.push_back(3); // keep
+      seq.push_back(x.at(i - 1));
+      i--;
+      j--;
+    }
+  }
+
+  uint32_t prev_op, op, s, word;
+  prev_op = 0, s = 0;
+  for (uint32_t k = 0; k < seq.size() / 2; k++) { // replay ops front to back
+    op = seq.at(seq.size() - 2 * k - 2);
+    word = seq.at(seq.size() - 2 * k - 1);
+    if (prev_op != 1) { // consecutive inserts share one cell
+      s++;
+    }
+    if (op == 1) // insert
+    {
+      edit_seqs.at(s - 1).push_back(word);
+    } else if (op == 2) // delete
+    {
+      edit_seqs.at(s - 1).push_back(deletion_symbol);
+    }
+
+    prev_op = op;
+  }
+
+  for (uint32_t k = 0; k < edit_seqs.size(); k++) { // mark empty cells
+    if (edit_seqs.at(k).size() == 0) {
+      edit_seqs.at(k).push_back(terminal_symbol);
+    }
+  }
+  return edit_seqs;
+}
+
+// Edit distance of every pair (xs[i], ys[i]): the last cell of its DP table.
+vector<uint32_t> compute_ed2(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys) {
+  vector<uint32_t> distances(xs.size());
+  for (uint32_t i = 0; i < xs.size(); i++) {
+    distances.at(i) = edit_distance2_with_dp(xs.at(i), ys.at(i)).back().back();
+  }
+  return distances;
+}
+
+vector<vector<vector<uint32_t>>> suggested_ed2_path(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys,
+    uint32_t terminal_symbol) {
+  vector<vector<vector<uint32_t>>> seq(xs.size()); // one entry per (x, y) pair
+  for (uint32_t i = 0; i < xs.size(); i++) {
+    vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i));
+    seq.at(i) = // backtrack through d to get the edit sequences of pair i
+        edit_distance2_backtracking(d, xs.at(i), ys.at(i), terminal_symbol);
+  }
+  return seq;
+}
+
+vector<vector<vector<uint32_t>>> suggested_ed2_path_with_delete(
+    vector<vector<uint32_t>>& xs,
+    vector<vector<uint32_t>>& ys,
+    uint32_t terminal_symbol,
+    uint32_t deletion_symbol) {
+  vector<vector<vector<uint32_t>>> seq(xs.size()); // one entry per (x, y) pair
+  for (uint32_t i = 0; i < xs.size(); i++) {
+    vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i));
+    seq.at(i) = edit_distance2_backtracking_with_delete( // backtrack through d
+        d, xs.at(i), ys.at(i), terminal_symbol, deletion_symbol);
+  }
+  return seq;
+}
+
+PYBIND11_MODULE(libnat, m) { // Python bindings for the batch-level entry points
+  m.def("compute_ed2", &compute_ed2, "compute_ed2");
+  m.def("suggested_ed2_path", &suggested_ed2_path, "suggested_ed2_path");
+  m.def(
+      "suggested_ed2_path_with_delete",
+      &suggested_ed2_path_with_delete,
+      "suggested_ed2_path_with_delete");
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
new file mode 100644
index 0000000000..6264236915
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/config.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/config.yaml
new file mode 100644
index 0000000000..2ed7168cb7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/config.yaml
@@ -0,0 +1,19 @@
+# @package _group_
+
+hydra:
+  run:
+    dir: .
+
+defaults:
+    - _self_
+    - task: null
+    - model: null
+    - criterion: cross_entropy
+    - optimizer: null
+    - lr_scheduler: fixed
+    - bpe: null
+    - tokenizer: null
+    - scoring: null
+    - generation: null
+    - common_eval: null
+    - eval_lm: null
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/fb_run_config/slurm.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/fb_run_config/slurm.yaml
new file mode 100644
index 0000000000..20cf8f5201
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/fb_run_config/slurm.yaml
@@ -0,0 +1,29 @@
+# @package _global_
+
+hydra:
+  job:
+    config:
+      override_dirname:
+        kv_sep: ':'
+        item_sep: '__'
+        exclude_keys:
+          - fb_run_config
+          - distributed_training.distributed_port
+  sweep:
+    dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname}
+  launcher:
+    cpus_per_task: 60
+    gpus_per_node: ???
+    tasks_per_node: 1
+    nodes: 1
+    partition: learnfair
+    mem_gb: 400
+    timeout_min: 4320
+    max_num_timeout: 10
+    name: ${env:PREFIX}_${hydra.job.config_name}
+    submitit_folder: ${hydra.sweep.dir}
+
+distributed_training:
+  ddp_backend: c10d
+  distributed_world_size: ???
+  distributed_port: ???
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_big.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_big.yaml
new file mode 100644
index 0000000000..309575310b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml
new file mode 100644
index 0000000000..30b1a4f1e0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml
new file mode 100644
index 0000000000..2c6cb7be38
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 768
+decoder_output_dim: 768
+decoder_input_dim: 768
+decoder_ffn_embed_dim: 3072
+decoder_layers: 12
+decoder_attention_heads: 12
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml
new file mode 100644
index 0000000000..a08769a178
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1600
+decoder_output_dim: 1600
+decoder_input_dim: 1600
+decoder_ffn_embed_dim: 6400
+decoder_layers: 48
+decoder_attention_heads: 25
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml
new file mode 100644
index 0000000000..64261d793c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1280
+decoder_output_dim: 1280
+decoder_input_dim: 1280
+decoder_ffn_embed_dim: 5120
+decoder_layers: 36
+decoder_attention_heads: 20
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml
new file mode 100644
index 0000000000..702e81f466
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "gelu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 24
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml
new file mode 100644
index 0000000000..1154cfa660
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml
@@ -0,0 +1,36 @@
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml
new file mode 100644
index 0000000000..ee1329bf46
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml
@@ -0,0 +1,5 @@
+# @package _group_
+activation: gelu
+vq_type: gumbel
+vq_depth: 2
+combine_groups: true
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_base.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_base.yaml
new file mode 100644
index 0000000000..ce65499b80
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_base.yaml
@@ -0,0 +1,8 @@
+# @package _group_
+
+quantize_targets: true
+final_dim: 256
+encoder_layerdrop: 0.05
+dropout_input: 0.1
+dropout_features: 0.1
+feature_grad_mult: 0.1
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_large.yaml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_large.yaml
new file mode 100644
index 0000000000..5846f75243
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/model/wav2vec2/wav2vec2_large.yaml
@@ -0,0 +1,20 @@
+# @package _group_
+
+quantize_targets: true
+extractor_mode: layer_norm
+layer_norm_first: true
+final_dim: 768
+latent_temp: [2.0,0.1,0.999995]
+encoder_layerdrop: 0.0
+dropout_input: 0.0
+dropout_features: 0.0
+dropout: 0.0
+attention_dropout: 0.0
+conv_bias: true
+
+encoder_layers: 24
+encoder_embed_dim: 1024
+encoder_ffn_embed_dim: 4096
+encoder_attention_heads: 16
+
+feature_grad_mult: 1.0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
new file mode 100644
index 0000000000..ecd65d34ad
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.criterions.fairseq_criterion import (  # noqa
+    FairseqCriterion,
+    LegacyFairseqCriterion,
+)
+from omegaconf import DictConfig
+
+
+(
+    build_criterion_,  # raw builder; wrapped by the public build_criterion() below
+    register_criterion,  # applied as @register_criterion(...) by the criterion modules
+    CRITERION_REGISTRY,
+    CRITERION_DATACLASS_REGISTRY,
+) = registry.setup_registry(  # unpack the four objects returned for the "--criterion" registry
+    "--criterion", base_class=FairseqCriterion, default="cross_entropy"
+)
+
+
+def build_criterion(cfg: DictConfig, task, from_checkpoint=False):  # public wrapper over the registry builder
+    return build_criterion_(cfg, task, from_checkpoint=from_checkpoint)
+
+
+# automatically import every public Python file in the criterions/ directory
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        file_name = file[: -len(".py")]  # strip only the suffix; find() would cut at the first ".py"
+        importlib.import_module("fairseq.criterions." + file_name)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
new file mode 100644
index 0000000000..fc1ac85404
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
@@ -0,0 +1,124 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.constants import DDP_BACKEND_CHOICES
+from omegaconf import II
+
+
+@dataclass
+class AdaptiveLossConfig(FairseqDataclass):  # options read by AdaptiveLoss.build_criterion
+    sentence_avg: bool = II("optimization.sentence_avg")  # interpolated from optimization.sentence_avg
+    ddp_backend: DDP_BACKEND_CHOICES = II("distributed_training.ddp_backend")  # interpolated; validated in build_criterion
+
+
+@register_criterion("adaptive_loss", dataclass=AdaptiveLossConfig)
+class AdaptiveLoss(FairseqCriterion):
+    """This is an implementation of the loss function accompanying the adaptive softmax approximation for
+    graphics processing units (GPU), described in the paper "Efficient softmax approximation for GPUs"
+    (http://arxiv.org/abs/1609.04309)."""
+
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+
+    @classmethod
+    def build_criterion(cls, cfg: AdaptiveLossConfig, task):  # raises for the c10d / pytorch_ddp backends
+        if cfg.ddp_backend in {"c10d", "pytorch_ddp"}:
+            raise Exception(
+                "AdaptiveLoss is not compatible with the PyTorch "
+                "version of DistributedDataParallel. Please use "
+                "`--ddp-backend=legacy_ddp` instead."
+            )
+        return cls(task, cfg.sentence_avg)
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss, summed over all adaptive-softmax clusters
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+
+        assert (
+            hasattr(model.decoder, "adaptive_softmax")
+            and model.decoder.adaptive_softmax is not None
+        )
+        adaptive_softmax = model.decoder.adaptive_softmax
+
+        net_output = model(**sample["net_input"])
+        orig_target = model.get_targets(sample, net_output)
+
+        nsentences = orig_target.size(0)  # read before the targets are flattened
+        orig_target = orig_target.view(-1)
+
+        bsz = orig_target.size(0)  # number of flattened target positions
+
+        logits, target = adaptive_softmax(net_output[0], orig_target)
+        assert len(target) == len(logits)
+
+        loss = net_output[0].new(1 if reduce else bsz).zero_()
+
+        for cluster_logits, cluster_target in zip(logits, target):
+            if cluster_target is not None:  # a None entry is skipped
+                assert cluster_target.min() >= 0 and cluster_target.max() < cluster_logits.size(1)
+                loss += F.cross_entropy(
+                    cluster_logits,
+                    cluster_target,
+                    ignore_index=self.padding_idx,
+                    reduction="sum" if reduce else "none",
+                )
+
+        orig = utils.strip_pad(orig_target, self.padding_idx)
+        ntokens = orig.numel()  # token count after padding is stripped
+        sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
+        logging_output = {
+            "loss": loss.data,
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+
+        metrics.log_scalar(  # dividing by log(2) converts the loss from base e to base 2
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if sample_size != ntokens:
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+            )
+        else:
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
new file mode 100644
index 0000000000..98e835fa6e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.criterions import LegacyFairseqCriterion, register_criterion
+from torch import nn
+
+
+@register_criterion("composite_loss")
+class CompositeLoss(LegacyFairseqCriterion):
+    """Composite criterion: given a list of model outputs and a matching list of targets,
+    applies an underlying criterion to each output-target pair and averages the losses."""
+
+    def __init__(self, args, task):
+        super().__init__(args, task)
+        self.underlying_criterion = args.underlying_criterion
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True,
+                            help='underlying criterion to use for the composite loss')
+        # fmt: on
+
+    @staticmethod
+    def build_underlying_criterion(args, task):
+        saved_criterion = args.criterion
+        args.criterion = args.underlying_criterion  # temporarily swap so the task builds the underlying criterion
+        assert saved_criterion != args.underlying_criterion
+        underlying_criterion = task.build_criterion(args)
+        args.criterion = saved_criterion  # restore the caller's original setting
+        return underlying_criterion
+
+    @classmethod
+    def build_criterion(cls, args, task):  # returns a _CompositeLoss instance, not a CompositeLoss
+        underlying_criterion = CompositeLoss.build_underlying_criterion(args, task)
+
+        class FakeModel(nn.Module):  # hands one precomputed (output, target) pair to the underlying criterion
+            def __init__(self, model, net_out, target):
+                super().__init__()
+                self.model = model
+                self.net_out = net_out
+                self.target = target
+
+            def forward(self, **unused):  # ignores its inputs and returns the stored output
+                return self.net_out
+
+            def get_normalized_probs(self, net_output, log_probs, sample=None):
+                return self.model.get_normalized_probs(
+                    net_output, log_probs, sample=sample
+                )
+
+            def get_targets(self, *unused):  # ignores its inputs and returns the stored target
+                return self.target
+
+            @property
+            def decoder(self):
+                return self.model.decoder
+
+        class _CompositeLoss(LegacyFairseqCriterion):
+            def __init__(self, args, task, underlying_criterion):
+                super().__init__(args, task)
+                self.underlying_criterion = underlying_criterion
+
+            def forward(self, model, sample, reduce=True):
+                net_outputs = model(**sample["net_input"])
+                targets = sample["target"]
+
+                bsz = targets[0].size(0)
+                loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_()
+
+                sample_size = 0
+                logging_output = {}
+                for o, t in zip(net_outputs[0], targets):  # one underlying-criterion pass per pair
+                    m = FakeModel(model, (o, net_outputs[1]), t)
+                    sample["target"] = t  # NOTE: overwrites the caller's sample["target"] in place
+                    l, ss, logging_output = self.underlying_criterion(m, sample, reduce)  # last pair's dict is kept
+                    loss += l
+                    sample_size += ss
+
+                loss.div_(len(targets))  # average the summed loss over the pairs
+                sample_size /= len(targets)
+
+                logging_output["loss"] = utils.item(loss.data) if reduce else loss.data
+                return loss, sample_size, logging_output
+
+            @staticmethod
+            def aggregate_logging_outputs(logging_outputs):  # delegates to the underlying criterion's class
+                return underlying_criterion.__class__.aggregate_logging_outputs(
+                    logging_outputs
+                )
+
+            @staticmethod
+            def reduce_metrics(logging_outputs) -> None:  # delegates to the underlying criterion's class
+                underlying_criterion.__class__.reduce_metrics(logging_outputs)
+
+        return _CompositeLoss(args, task, underlying_criterion)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
new file mode 100644
index 0000000000..24d6bcd612
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
@@ -0,0 +1,91 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import II
+
+
+@dataclass
+class CrossEntropyCriterionConfig(FairseqDataclass):  # config dataclass registered with "cross_entropy"
+    sentence_avg: bool = II("optimization.sentence_avg")  # interpolated from optimization.sentence_avg
+
+
+@register_criterion("cross_entropy", dataclass=CrossEntropyCriterionConfig)
+class CrossEntropyCriterion(FairseqCriterion):
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce)
+        sample_size = (  # sentences when sentence_avg is set, otherwise tokens
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    def compute_loss(self, model, net_output, sample, reduce=True):
+        lprobs = model.get_normalized_probs(net_output, log_probs=True)
+        lprobs = lprobs.view(-1, lprobs.size(-1))  # flatten to 2-D, keeping the last dimension
+        target = model.get_targets(sample, net_output).view(-1)
+        loss = F.nll_loss(
+            lprobs,
+            target,
+            ignore_index=self.padding_idx,
+            reduction="sum" if reduce else "none",
+        )
+        return loss, loss  # same tensor twice; forward() discards the second element
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        # we divide by log(2) to convert the loss from base e to base 2
+        metrics.log_scalar(
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if sample_size != ntokens:
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+            )
+        else:
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
new file mode 100644
index 0000000000..368213cb2b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
@@ -0,0 +1,325 @@
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import math
+from argparse import Namespace
+from dataclasses import dataclass, field
+from omegaconf import II
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.data.data_utils import post_process
+from fairseq.tasks import FairseqTask
+from fairseq.logging.meters import safe_round
+
+
+@dataclass
+class CtcCriterionConfig(FairseqDataclass):  # options read by CtcCriterion.__init__
+    zero_infinity: bool = field(
+        default=False,
+        metadata={"help": "zero inf loss when source length <= target length"},
+    )
+    sentence_avg: bool = II("optimization.sentence_avg")
+    post_process: str = field(
+        default="letter",
+        metadata={
+            "help": "how to post process predictions into words. can be letter, "
+            "wordpiece, BPE symbols, etc. "
+            "See fairseq.data.data_utils.post_process() for full list of options"
+        },
+    )
+    wer_kenlm_model: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "if this is provided, use kenlm to compute wer (along with other wer_* args)"
+        },
+    )
+    wer_lexicon: Optional[str] = field(
+        default=None,
+        metadata={"help": "lexicon to use with wer_kenlm_model"},
+    )
+    wer_lm_weight: float = field(
+        default=2.0,
+        metadata={"help": "lm weight to use with wer_kenlm_model"},
+    )
+    wer_word_score: float = field(
+        default=-1.0,
+        metadata={"help": "lm word score to use with wer_kenlm_model"},
+    )
+    wer_sil_weight: float = field(
+        default=0,
+        metadata={"help": "silence weight to use with wer_kenlm_model"},
+    )
+
+    wer_args: Optional[str] = field(  # legacy packed form; unpacked into the wer_* fields by CtcCriterion
+        default=None,
+        metadata={
+            "help": "DEPRECATED: tuple of (wer_kenlm_model, wer_lexicon, wer_lm_weight, wer_word_score)"
+        },
+    )
+
+
+@register_criterion("ctc", dataclass=CtcCriterionConfig)
+class CtcCriterion(FairseqCriterion):
+    def __init__(
+        self, cfg: CtcCriterionConfig, task: FairseqTask, rdrop_alpha: float = 0.0
+    ):
+        super().__init__(task)
+        self.blank_idx = (  # index of task.blank_symbol when the task defines one, else 0
+            task.target_dictionary.index(task.blank_symbol)
+            if hasattr(task, "blank_symbol")
+            else 0
+        )
+        self.pad_idx = task.target_dictionary.pad()
+        self.eos_idx = task.target_dictionary.eos()
+        self.post_process = cfg.post_process
+
+        self.rdrop_alpha = rdrop_alpha
+
+        if cfg.wer_args is not None:
+            # NOTE(security): eval() executes arbitrary code; wer_args must come from a trusted config
+            (
+                cfg.wer_kenlm_model,
+                cfg.wer_lexicon,
+                cfg.wer_lm_weight,
+                cfg.wer_word_score,
+            ) = eval(cfg.wer_args)
+
+        if cfg.wer_kenlm_model is not None and cfg.wer_kenlm_model != "":
+            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
+
+            dec_args = Namespace()
+            dec_args.nbest = 1
+            dec_args.criterion = "ctc"
+            dec_args.kenlm_model = cfg.wer_kenlm_model
+            dec_args.lexicon = cfg.wer_lexicon
+            dec_args.beam = 50
+            dec_args.beam_size_token = min(50, len(task.target_dictionary))
+            dec_args.beam_threshold = min(50, len(task.target_dictionary))
+            dec_args.lm_weight = cfg.wer_lm_weight
+            dec_args.word_score = cfg.wer_word_score
+            dec_args.sil_weight = cfg.wer_sil_weight  # kept as configured; no longer reset to 0 afterwards
+            dec_args.unk_weight = -math.inf
+
+            self.w2l_decoder = W2lKenLMDecoder(dec_args, task.target_dictionary)
+        else:
+            self.w2l_decoder = None  # forward() then scores WER from the greedy decode only
+
+        self.zero_infinity = cfg.zero_infinity
+        self.sentence_avg = cfg.sentence_avg
+
+    def forward(self, model, sample, reduce=True, **kwargs):  # returns (loss, sample_size, logging_output)
+        net_output = model(**sample["net_input"])
+        lprobs = model.get_normalized_probs(
+            net_output, log_probs=True
+        ).contiguous()  # (T, B, C) from the encoder
+
+        # R-Drop path: when rdrop_alpha > 0, target/target_lengths are doubled along dim 0
+        # (original note: the sample is already duplicated for R-Drop)
+        if self.rdrop_alpha > 0:
+            for k, v in sample.items():
+                if k in ["target", "target_lengths"]:
+                    sample[k] = torch.cat([v, v.clone()], dim=0)
+                elif k == "net_input":
+                    if sample[k]["src_tokens"].size(1) != sample[k]["src_lengths"].size(
+                        0
+                    ):
+                        # for decoder CTC loss
+                        sample[k]["src_lengths"] = torch.cat(
+                            [
+                                sample[k]["src_lengths"],
+                                sample[k]["src_lengths"].clone(),
+                            ],
+                            dim=0,
+                        )
+
+        if "src_lengths" in sample["net_input"]:
+            input_lengths = sample["net_input"]["src_lengths"]
+        else:
+            if net_output["padding_mask"] is not None:
+                non_padding_mask = ~net_output["padding_mask"]
+                input_lengths = non_padding_mask.long().sum(-1)
+            else:  # no mask available: every sequence gets the full length lprobs.size(0)
+                input_lengths = lprobs.new_full(
+                    (lprobs.size(1),), lprobs.size(0), dtype=torch.long
+                )
+
+        pad_mask = (sample["target"] != self.pad_idx) & (
+            sample["target"] != self.eos_idx
+        )  # despite the name, True where the target is neither pad nor eos
+        targets_flat = sample["target"].masked_select(pad_mask)
+        if "target_lengths" in sample:
+            target_lengths = sample["target_lengths"]
+        else:
+            target_lengths = pad_mask.sum(-1)
+
+        with torch.backends.cudnn.flags(enabled=False):  # compute ctc_loss with cuDNN disabled
+            loss = F.ctc_loss(
+                lprobs,
+                targets_flat,
+                input_lengths,
+                target_lengths,
+                blank=self.blank_idx,
+                reduction="sum",  # fixed; the reduce argument is not consulted
+                zero_infinity=self.zero_infinity,
+            )
+
+        ntokens = (
+            sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item()
+        )
+
+        sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
+        logging_output = {
+            "loss": utils.item(loss.data),  # * sample['ntokens'],
+            "ntokens": ntokens,
+            "nsentences": sample["id"].numel(),
+            "sample_size": sample_size,
+        }
+
+        if not model.training:  # error-rate counters are only collected outside training mode
+            import editdistance
+
+            with torch.no_grad():
+                lprobs_t = lprobs.transpose(0, 1).float().contiguous().cpu()
+
+                c_err = 0
+                c_len = 0
+                w_errs = 0
+                w_len = 0
+                wv_errs = 0
+                for lp, t, inp_l in zip(
+                    lprobs_t,
+                    sample["target_label"]
+                    if "target_label" in sample
+                    else sample["target"],
+                    input_lengths,
+                ):
+                    lp = lp[:inp_l].unsqueeze(0)  # keep only the first inp_l frames
+
+                    decoded = None
+                    if self.w2l_decoder is not None:
+                        decoded = self.w2l_decoder.decode(lp)
+                        if len(decoded) < 1:
+                            decoded = None
+                        else:
+                            decoded = decoded[0]
+                            if len(decoded) < 1:
+                                decoded = None
+                            else:
+                                decoded = decoded[0]  # first entry of the first result, if present
+
+                    p = (t != self.task.target_dictionary.pad()) & (
+                        t != self.task.target_dictionary.eos()
+                    )
+                    targ = t[p]
+                    targ_units = self.task.target_dictionary.string(targ)
+                    targ_units_arr = targ.tolist()
+
+                    toks = lp.argmax(dim=-1).unique_consecutive()  # greedy path with repeats collapsed
+                    pred_units_arr = toks[toks != self.blank_idx].tolist()
+
+                    c_err += editdistance.eval(pred_units_arr, targ_units_arr)
+                    c_len += len(targ_units_arr)
+
+                    targ_words = post_process(targ_units, self.post_process).split()
+
+                    pred_units = self.task.target_dictionary.string(pred_units_arr)
+                    pred_words_raw = post_process(pred_units, self.post_process).split()
+
+                    if decoded is not None and "words" in decoded:
+                        pred_words = decoded["words"]
+                        w_errs += editdistance.eval(pred_words, targ_words)
+                        wv_errs += editdistance.eval(pred_words_raw, targ_words)
+                    else:  # no decoder words: both counters use the greedy prediction
+                        dist = editdistance.eval(pred_words_raw, targ_words)
+                        w_errs += dist
+                        wv_errs += dist
+
+                    w_len += len(targ_words)
+
+                logging_output["wv_errors"] = wv_errs
+                logging_output["w_errors"] = w_errs
+                logging_output["w_total"] = w_len
+                logging_output["c_errors"] = c_err
+                logging_output["c_total"] = c_len
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+        nsentences = utils.item(
+            sum(log.get("nsentences", 0) for log in logging_outputs)
+        )
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+
+        metrics.log_scalar(  # loss is divided by log(2) before logging
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_scalar("ntokens", ntokens)
+        metrics.log_scalar("nsentences", nsentences)
+        if sample_size != ntokens:
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+
+        c_errors = sum(log.get("c_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_c_errors", c_errors)  # underscore-prefixed counters feed the derived rates below
+        c_total = sum(log.get("c_total", 0) for log in logging_outputs)
+        metrics.log_scalar("_c_total", c_total)
+        w_errors = sum(log.get("w_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_w_errors", w_errors)
+        wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs)
+        metrics.log_scalar("_wv_errors", wv_errors)
+        w_total = sum(log.get("w_total", 0) for log in logging_outputs)
+        metrics.log_scalar("_w_total", w_total)
+
+        if c_total > 0:
+            metrics.log_derived(
+                "uer",  # 100 * _c_errors / _c_total
+                lambda meters: safe_round(
+                    meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
+                )
+                if meters["_c_total"].sum > 0
+                else float("nan"),
+            )
+        if w_total > 0:
+            metrics.log_derived(
+                "wer",  # 100 * _w_errors / _w_total
+                lambda meters: safe_round(
+                    meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+                )
+                if meters["_w_total"].sum > 0
+                else float("nan"),
+            )
+            metrics.log_derived(
+                "raw_wer",  # 100 * _wv_errors / _w_total
+                lambda meters: safe_round(
+                    meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+                )
+                if meters["_w_total"].sum > 0
+                else float("nan"),
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
new file mode 100644
index 0000000000..0b1e64a8e3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Any, Dict, List
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from torch.nn.modules.loss import _Loss
+
+
+class FairseqCriterion(_Loss):
+    def __init__(self, task):
+        super().__init__()
+        self.task = task
+        if hasattr(task, "target_dictionary"):
+            tgt_dict = task.target_dictionary
+            self.padding_idx = tgt_dict.pad() if tgt_dict is not None else -100
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add criterion-specific arguments to the parser."""
+        dc = getattr(cls, "__dataclass", None)
+        if dc is not None:
+            gen_parser_from_dataclass(parser, dc())
+
+    @classmethod
+    def build_criterion(cls, cfg: FairseqDataclass, task):
+        """Construct a criterion from command-line args.
+
+        Each `__init__` parameter is filled from `task`, `cfg`, or the
+        attribute of `cfg` with the same name; otherwise its default is used.
+        """
+        init_args = {}
+        for p in inspect.signature(cls).parameters.values():
+            if p.kind in (p.POSITIONAL_ONLY, p.VAR_POSITIONAL, p.VAR_KEYWORD):
+                # we haven't implemented inference for these argument types,
+                # but PRs welcome :)
+                raise NotImplementedError("{} not supported".format(p.kind))
+
+            assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY}
+
+            if p.name == "task":
+                init_args["task"] = task
+            elif p.name == "cfg":
+                init_args["cfg"] = cfg
+            elif hasattr(cfg, p.name):
+                init_args[p.name] = getattr(cfg, p.name)
+            elif p.default is not p.empty:
+                # `empty` is a sentinel; `!=` could be overloaded by the default
+                pass  # we'll use the default value
+            else:
+                raise NotImplementedError(
+                    "Unable to infer Criterion arguments, please implement "
+                    "{}.build_criterion".format(cls.__name__)
+                )
+        return cls(**init_args)
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def aggregate_logging_outputs(
+        logging_outputs: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """Aggregate logging outputs from data parallel training."""
+        utils.deprecation_warning(
+            "The aggregate_logging_outputs API is deprecated. "
+            "Please use the reduce_metrics API instead."
+        )
+        raise NotImplementedError
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        utils.deprecation_warning(
+            "Criterions should implement the reduce_metrics API. "
+            "Falling back to deprecated aggregate_logging_outputs API."
+        )
+        agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs)
+        for k, v in agg_logging_outputs.items():
+            if k in {"nsentences", "ntokens", "sample_size"}:
+                continue
+            metrics.log_scalar(k, v)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return False
+
+
+class LegacyFairseqCriterion(FairseqCriterion):
+    def __init__(self, args, task):
+        super().__init__(task=task)
+        self.args = args
+
+        utils.deprecation_warning(
+            "Criterions should take explicit arguments instead of an "
+            "argparse.Namespace object, please update your criterion by "
+            "extending FairseqCriterion instead of LegacyFairseqCriterion."
+        )
+
+    @classmethod
+    def build_criterion(cls, args, task):
+        """Construct a criterion by passing `args` and `task` to the constructor."""
+        return cls(args, task)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
new file mode 100644
index 0000000000..ab7cd08e3b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+from typing import List, Dict, Any
+from dataclasses import dataclass, field
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.data.data_utils import lengths_to_mask
+from fairseq.models.fairseq_model import FairseqEncoderModel
+
+
+@dataclass
+class FastSpeech2CriterionConfig(FairseqDataclass):
+    ctc_weight: float = field(default=0.0, metadata={"help": "weight for CTC loss"})
+
+
+@register_criterion("fastspeech2", dataclass=FastSpeech2CriterionConfig)
+class FastSpeech2Loss(FairseqCriterion):
+    def __init__(self, task, ctc_weight):
+        super().__init__(task)
+        self.ctc_weight = ctc_weight
+
+    def forward(self, model: FairseqEncoderModel, sample, reduction="mean"):
+        """Return (loss, sample size, logging outputs) for the given sample."""
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lens = sample["net_input"]["src_lengths"]
+        tgt_lens = sample["target_lengths"]
+        _feat_out, _feat_out_post, _, log_dur_out, pitch_out, energy_out = model(
+            src_tokens=src_tokens,
+            src_lengths=src_lens,
+            prev_output_tokens=sample["net_input"]["prev_output_tokens"],
+            incremental_state=None,
+            target_lengths=tgt_lens,
+            speaker=sample["speaker"],
+            durations=sample["durations"],
+            pitches=sample["pitches"],
+            energies=sample["energies"],
+        )
+
+        src_mask = lengths_to_mask(sample["net_input"]["src_lengths"])
+        tgt_mask = lengths_to_mask(sample["target_lengths"])
+
+        pitches, energies = sample["pitches"], sample["energies"]
+        pitch_out, pitches = pitch_out[src_mask], pitches[src_mask]
+        energy_out, energies = energy_out[src_mask], energies[src_mask]
+
+        feat_out, feat = _feat_out[tgt_mask], sample["target"][tgt_mask]
+        l1_loss = F.l1_loss(feat_out, feat, reduction=reduction)
+        if _feat_out_post is not None:
+            l1_loss += F.l1_loss(_feat_out_post[tgt_mask], feat, reduction=reduction)
+
+        pitch_loss = F.mse_loss(pitch_out, pitches, reduction=reduction)
+        energy_loss = F.mse_loss(energy_out, energies, reduction=reduction)
+
+        log_dur_out = log_dur_out[src_mask]
+        dur = sample["durations"].float()
+        dur = dur.half() if log_dur_out.type().endswith(".HalfTensor") else dur
+        log_dur = torch.log(dur + 1)[src_mask]
+        dur_loss = F.mse_loss(log_dur_out, log_dur, reduction=reduction)
+
+        ctc_loss = torch.tensor(0.0).type_as(l1_loss)
+        if self.ctc_weight > 0.0:
+            lprobs = model.get_normalized_probs((_feat_out,), log_probs=True)
+            lprobs = lprobs.transpose(0, 1)  # T x B x C
+            src_tokens_flat = src_tokens.masked_select(src_mask)
+            ctc_loss = (
+                F.ctc_loss(
+                    lprobs,
+                    src_tokens_flat,
+                    tgt_lens,
+                    src_lens,
+                    reduction=reduction,
+                    zero_infinity=True,
+                )
+                * self.ctc_weight
+            )
+
+        loss = l1_loss + dur_loss + pitch_loss + energy_loss + ctc_loss
+
+        sample_size = sample["nsentences"]
+        logging_output = {
+            "loss": utils.item(loss.data),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+            "l1_loss": utils.item(l1_loss.data),
+            "dur_loss": utils.item(dur_loss.data),
+            "pitch_loss": utils.item(pitch_loss.data),
+            "energy_loss": utils.item(energy_loss.data),
+            "ctc_loss": utils.item(ctc_loss.data),
+        }
+        return loss, sample_size, logging_output
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
+        """Log sample-size-weighted losses and, if present, inference metrics."""
+        ns = [log.get("sample_size", 0) for log in logging_outputs]
+        ntot = sum(ns)
+        # weight every logging output by its share of the total sample size
+        ws = [n / (ntot + 1e-8) for n in ns]
+        for key in (
+            "loss", "l1_loss", "dur_loss", "pitch_loss", "energy_loss", "ctc_loss"
+        ):
+            vals = [log.get(key, 0) for log in logging_outputs]
+            val = sum(val * w for val, w in zip(vals, ws))
+            metrics.log_scalar(key, val, ntot, round=3)
+        metrics.log_scalar("sample_size", ntot, len(logging_outputs))
+
+        # inference metrics
+        if not logging_outputs or "targ_frames" not in logging_outputs[0]:
+            return
+        n = sum(log.get("targ_frames", 0) for log in logging_outputs)
+        if n == 0:
+            # no target frames were logged: the per-frame rates are undefined
+            return
+        for key, new_key in [
+            ("mcd_loss", "mcd_loss"),
+            ("pred_frames", "pred_ratio"),
+            ("nins", "ins_rate"),
+            ("ndel", "del_rate"),
+        ]:
+            val = sum(log.get(key, 0) for log in logging_outputs)
+            metrics.log_scalar(new_key, val / n, n, round=3)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
new file mode 100644
index 0000000000..262874b582
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import re
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class HubertCriterionConfig(FairseqDataclass):
+    pred_masked_weight: float = field(
+        default=1.0,
+        metadata={"help": "weight for predictive loss for masked frames"},
+    )
+    pred_nomask_weight: float = field(
+        default=0.0,
+        metadata={"help": "weight for predictive loss for unmasked frames"},
+    )
+    loss_weights: Optional[List[float]] = field(
+        default=None,
+        metadata={"help": "weights for additional loss terms (not first one)"},
+    )
+    log_keys: List[str] = field(
+        default_factory=lambda: [],
+        metadata={"help": "output keys to log"},
+    )
+
+
+@register_criterion("hubert", dataclass=HubertCriterionConfig)
+class HubertCriterion(FairseqCriterion):
+    def __init__(
+        self,
+        task,
+        pred_masked_weight,
+        pred_nomask_weight,
+        loss_weights=None,
+        log_keys=None,
+    ):
+        super().__init__(task)
+        self.pred_masked_weight = pred_masked_weight
+        self.pred_nomask_weight = pred_nomask_weight
+        self.loss_weights = loss_weights
+        self.log_keys = [] if log_keys is None else log_keys
+
+    def forward(self, model, sample, reduce=True, log_pred=False):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(target_list=sample["target_list"], **sample["net_input"])
+        loss = 0.0
+        sample_size = 0
+        logging_output = {}
+        reduction = "sum" if reduce else "none"
+
+        loss_m_list = []
+        logp_m_list = model.get_logits(net_output, True)
+        targ_m_list = model.get_targets(net_output, True)
+        assert self.pred_masked_weight == 0 or len(logp_m_list) > 0
+        for i, (logp_m, targ_m) in enumerate(zip(logp_m_list, targ_m_list)):
+            loss_m = F.cross_entropy(logp_m, targ_m, reduction=reduction)
+            loss_m_list.append(loss_m)
+            logging_output[f"loss_m_{i}"] = loss_m.detach().item()
+        if self.pred_masked_weight > 0:
+            loss += self.pred_masked_weight * sum(loss_m_list)
+            sample_size += targ_m_list[0].numel()
+
+        loss_u_list = []
+        logp_u_list = model.get_logits(net_output, False)
+        targ_u_list = model.get_targets(net_output, False)
+        assert self.pred_nomask_weight == 0 or len(logp_u_list) > 0
+        for i, (logp_u, targ_u) in enumerate(zip(logp_u_list, targ_u_list)):
+            loss_u = F.cross_entropy(logp_u, targ_u, reduction=reduction)
+            loss_u_list.append(loss_u)
+            logging_output[f"loss_u_{i}"] = loss_u.detach().item()
+        if self.pred_nomask_weight > 0:
+            loss += self.pred_nomask_weight * sum(loss_u_list)
+            sample_size += targ_u_list[0].numel()
+
+        if self.loss_weights is not None:
+            assert hasattr(model, "get_extra_losses")
+            extra_losses, names = model.get_extra_losses(net_output)
+            if torch.is_tensor(extra_losses):
+                extra_losses = [extra_losses]
+                names = [names]
+            if len(self.loss_weights) == 1 and len(extra_losses) != 1:
+                self.loss_weights = [self.loss_weights[0]] * len(extra_losses)
+            assert len(extra_losses) == len(
+                self.loss_weights
+            ), f"{len(extra_losses)}, {len(self.loss_weights)}"
+            for p, n, coef in zip(extra_losses, names, self.loss_weights):
+                if coef != 0 and p is not None:
+                    p = coef * p.float() * sample_size
+                    loss += p
+                    logging_output[f"loss_{n}"] = p.item()
+
+        logging_output = {
+            "loss": loss.item() if reduce else loss,
+            "ntokens": sample_size,
+            "nsentences": sample["id"].numel(),
+            "sample_size": sample_size,
+            **logging_output,
+        }
+
+        for lk in self.log_keys:
+            if lk in net_output:
+                logging_output[lk] = float((net_output[lk]))
+
+        def compute_correct(logits):
+            # rows whose argmax is index 0, minus rows where index 0 is also argmin
+            if logits.numel() == 0:
+                return 0, 0
+            assert logits.dim() > 1, logits.shape
+            is_max = logits.argmax(-1) == 0
+            is_min = logits.argmin(-1) == 0
+            both = is_max & is_min
+            corr = is_max.long().sum().item() - both.long().sum().item()
+            count = is_max.numel()
+            return corr, count
+
+        with torch.no_grad():
+            for i, logp_m in enumerate(logp_m_list):
+                corr_m, count_m = compute_correct(logp_m)
+                logging_output[f"correct_m_{i}"] = corr_m
+                logging_output[f"count_m_{i}"] = count_m
+
+            for i, logp_u in enumerate(logp_u_list):
+                corr_u, count_u = compute_correct(logp_u)
+                logging_output[f"correct_u_{i}"] = corr_u
+                logging_output[f"count_u_{i}"] = count_u
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training (copied from normal cross entropy)."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if sample_size != ntokens:
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+            )
+        else:
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+            )
+
+        counts = {}
+        for lk in logging_outputs[0].keys():
+            if lk.startswith("count_"):
+                counts[lk] = sum(log[lk] for log in logging_outputs)
+                metrics.log_scalar(lk, counts[lk])
+        for lk in logging_outputs[0].keys():
+            if lk.startswith("loss_"):
+                val = sum(log[lk] for log in logging_outputs)
+                metrics.log_scalar(lk, val / sample_size / math.log(2), round=3)
+            elif lk.startswith("correct_"):
+                count = counts[re.sub("correct", "count", lk)]
+                if count > 0:  # `forward` logs count 0 for empty logits
+                    correct = sum(log[lk] for log in logging_outputs)
+                    metrics.log_scalar(lk, correct / count)
+
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
new file mode 100644
index 0000000000..325679bb16
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+
+import torch
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import II
+
+
+@dataclass
+class LabelSmoothedCrossEntropyCriterionConfig(FairseqDataclass):
+    label_smoothing: float = field(
+        default=0.0,
+        metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
+    )
+    report_accuracy: bool = field(
+        default=False,
+        metadata={"help": "report accuracy metric"},
+    )
+    ignore_prefix_size: int = field(
+        default=0,
+        metadata={"help": "Ignore first N tokens"},
+    )
+    sentence_avg: bool = II("optimization.sentence_avg")
+
+
+def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True):
+    """Return the label-smoothed loss together with the plain NLL loss."""
+    if target.dim() == lprobs.dim() - 1:
+        target = target.unsqueeze(-1)
+    nll = -lprobs.gather(dim=-1, index=target)
+    smooth = -lprobs.sum(dim=-1, keepdim=True)
+    if ignore_index is None:
+        nll, smooth = nll.squeeze(-1), smooth.squeeze(-1)
+    else:
+        # zero the contribution of positions whose target is `ignore_index`
+        ignored = target.eq(ignore_index)
+        nll.masked_fill_(ignored, 0.0)
+        smooth.masked_fill_(ignored, 0.0)
+    if reduce:
+        nll, smooth = nll.sum(), smooth.sum()
+    uniform_eps = epsilon / (lprobs.size(-1) - 1)
+    smoothed = (1.0 - epsilon - uniform_eps) * nll + uniform_eps * smooth
+    return smoothed, nll
+
+
+@register_criterion(
+    "label_smoothed_cross_entropy", dataclass=LabelSmoothedCrossEntropyCriterionConfig
+)
+class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size=0,
+        report_accuracy=False,
+    ):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+        self.eps = label_smoothing
+        self.ignore_prefix_size = ignore_prefix_size
+        self.report_accuracy = report_accuracy
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, net_output, sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        return loss, sample_size, logging_output
+
+    def get_lprobs_and_target(self, model, net_output, sample):
+        lprobs = model.get_normalized_probs(net_output, log_probs=True)
+        target = model.get_targets(sample, net_output)
+        if self.ignore_prefix_size > 0:
+            # lprobs: B x T x C
+            lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
+            target = target[:, self.ignore_prefix_size :].contiguous()
+        return lprobs.view(-1, lprobs.size(-1)), target.view(-1)
+
+    def compute_loss(self, model, net_output, sample, reduce=True):
+        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
+        loss, nll_loss = label_smoothed_nll_loss(
+            lprobs,
+            target,
+            self.eps,
+            ignore_index=self.padding_idx,
+            reduce=reduce,
+        )
+        return loss, nll_loss
+
+    def compute_accuracy(self, model, net_output, sample):
+        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
+        mask = target.ne(self.padding_idx)
+        n_correct = torch.sum(
+            lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))
+        )
+        total = torch.sum(mask)
+        return n_correct, total
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_scalar(
+            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
+        )
+        metrics.log_derived(
+            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+        )
+
+        total = utils.item(sum(log.get("total", 0) for log in logging_outputs))
+        if total > 0:
+            metrics.log_scalar("total", total)
+            n_correct = utils.item(
+                sum(log.get("n_correct", 0) for log in logging_outputs)
+            )
+            metrics.log_scalar("n_correct", n_correct)
+            metrics.log_derived(
+                "accuracy",
+                lambda meters: round(
+                    meters["n_correct"].sum * 100.0 / meters["total"].sum, 3
+                )
+                if meters["total"].sum > 0
+                else float("nan"),
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
new file mode 100644
index 0000000000..6eaedab9cf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
@@ -0,0 +1,221 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+import torch
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import register_criterion
+from fairseq.criterions.label_smoothed_cross_entropy import (
+    LabelSmoothedCrossEntropyCriterion,
+    LabelSmoothedCrossEntropyCriterionConfig,
+)
+
+try:
+    from simuleval.metrics.latency import (
+        AverageLagging,
+        AverageProportion,
+        DifferentiableAverageLagging,
+    )
+
+    LATENCY_METRICS = {
+        "average_lagging": AverageLagging,
+        "average_proportion": AverageProportion,
+        "differentiable_average_lagging": DifferentiableAverageLagging,
+    }
+except ImportError:
+    LATENCY_METRICS = None
+
+
+@dataclass
+class LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig(
+    LabelSmoothedCrossEntropyCriterionConfig
+):
+    latency_avg_weight: float = field(
+        default=0.0,
+        metadata={"help": "weight for average latency loss."},
+    )
+    latency_var_weight: float = field(
+        default=0.0,
+        metadata={"help": "weight for variance latency loss."},
+    )
+    latency_avg_type: str = field(
+        default="differentiable_average_lagging",
+        metadata={"help": "latency type for average loss"},
+    )
+    latency_var_type: str = field(
+        default="variance_delay",
+        metadata={"help": "latency type for variance loss"},
+    )
+    latency_gather_method: str = field(
+        default="weighted_average",
+        metadata={"help": "method to gather latency loss for all heads"},
+    )
+    latency_update_after: int = field(
+        default=0,
+        metadata={"help": "Add latency loss after certain steps"},
+    )
+
+
+@register_criterion(
+    "latency_augmented_label_smoothed_cross_entropy",
+    dataclass=LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig,
+)
+class LatencyAugmentedLabelSmoothedCrossEntropyCriterion(
+    LabelSmoothedCrossEntropyCriterion
+):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size,
+        report_accuracy,
+        latency_avg_weight,
+        latency_var_weight,
+        latency_avg_type,
+        latency_var_type,
+        latency_gather_method,
+        latency_update_after,
+    ):
+        super().__init__(
+            task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy
+        )
+        assert LATENCY_METRICS is not None, "Please make sure SimulEval is installed."
+
+        self.latency_avg_weight = latency_avg_weight
+        self.latency_var_weight = latency_var_weight
+        self.latency_avg_type = latency_avg_type
+        self.latency_var_type = latency_var_type
+        self.latency_gather_method = latency_gather_method
+        self.latency_update_after = latency_update_after
+
+    def forward(self, model, sample, reduce=True):
+        net_output = model(**sample["net_input"])
+        # 1. Compute cross entropy loss
+        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
+
+        # 2. Compute cross latency loss
+        latency_loss, expected_latency, expected_delays_var = self.compute_latency_loss(
+            model, sample, net_output
+        )
+
+        if self.latency_update_after > 0:
+            num_updates = getattr(model.decoder, "num_updates", None)
+            assert (
+                num_updates is not None
+            ), "model.decoder doesn't have attribute 'num_updates'"
+            if num_updates <= self.latency_update_after:
+                latency_loss = 0
+
+        loss += latency_loss
+
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+            "latency": expected_latency,
+            "delays_var": expected_delays_var,
+            "latency_loss": latency_loss,
+        }
+
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, net_output, sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        return loss, sample_size, logging_output
+
+    def compute_latency_loss(self, model, sample, net_output):
+        """Return (latency loss, summed expected latency, summed delay variance)."""
+        assert (
+            net_output[-1].encoder_padding_mask is None
+            or not net_output[-1].encoder_padding_mask[:, 0].any()
+        ), "Only right padding on source is supported."
+        # 1. Obtain the expected alignment
+        alpha_list = [item["alpha"] for item in net_output[1].attn_list]
+        num_layers = len(alpha_list)
+        bsz, num_heads, tgt_len, src_len = alpha_list[0].size()
+
+        # bsz * num_layers * num_heads, tgt_len, src_len
+        alpha_all = torch.cat(alpha_list, dim=1).view(-1, tgt_len, src_len)
+
+        # 2 compute expected delays
+        # bsz * num_heads * num_layers, tgt_len, src_len for MMA
+        steps = (
+            torch.arange(1, 1 + src_len)
+            .unsqueeze(0)
+            .unsqueeze(1)
+            .expand_as(alpha_all)
+            .type_as(alpha_all)
+        )
+
+        expected_delays = torch.sum(steps * alpha_all, dim=-1)
+
+        target_padding_mask = (
+            model.get_targets(sample, net_output)
+            .eq(self.padding_idx)
+            .unsqueeze(1)
+            .expand(bsz, num_layers * num_heads, tgt_len)
+            .contiguous()
+            .view(-1, tgt_len)
+        )
+
+        src_lengths = (
+            sample["net_input"]["src_lengths"]
+            .unsqueeze(1)
+            .expand(bsz, num_layers * num_heads)
+            .contiguous()
+            .view(-1)
+        )
+        expected_latency = LATENCY_METRICS[self.latency_avg_type](
+            expected_delays, src_lengths, None, target_padding_mask=target_padding_mask
+        )
+
+        # 2.1 average expected latency of heads
+        # bsz, num_layers * num_heads
+        expected_latency = expected_latency.view(bsz, -1)
+        if self.latency_gather_method == "average":
+            # bsz; gather the per-head latency like the sibling branches do
+            expected_latency = expected_latency.mean(dim=1)
+        elif self.latency_gather_method == "weighted_average":
+            weights = torch.nn.functional.softmax(expected_latency, dim=1)
+            expected_latency = torch.sum(expected_latency * weights, dim=1)
+        elif self.latency_gather_method == "max":
+            expected_latency = expected_latency.max(dim=1)[0]
+        else:
+            raise NotImplementedError
+
+        expected_latency = expected_latency.sum()
+        avg_loss = self.latency_avg_weight * expected_latency
+
+        # 2.2 variance of expected delays
+        expected_delays_var = (
+            expected_delays.view(bsz, -1, tgt_len).var(dim=1).mean(dim=1).sum()
+        )
+        var_loss = self.latency_var_weight * expected_delays_var
+
+        # 3. Final loss
+        latency_loss = avg_loss + var_loss
+
+        return latency_loss, expected_latency, expected_delays_var
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:
+        super().reduce_metrics(logging_outputs)
+        latency = sum(log.get("latency", 0) for log in logging_outputs)
+        delays_var = sum(log.get("delays_var", 0) for log in logging_outputs)
+        latency_loss = sum(log.get("latency_loss", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        metrics.log_scalar("latency", latency.float() / nsentences, nsentences, round=3)
+        metrics.log_scalar("delays_var", delays_var / nsentences, nsentences, round=3)
+        metrics.log_scalar(
+            "latency_loss", latency_loss / nsentences, nsentences, round=3
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
new file mode 100644
index 0000000000..b55f65e5cc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
@@ -0,0 +1,131 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import register_criterion
+
+from .label_smoothed_cross_entropy import (
+    LabelSmoothedCrossEntropyCriterion,
+    LabelSmoothedCrossEntropyCriterionConfig,
+)
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class LabelSmoothedCrossEntropyCriterionWithAlignmentConfig(
+    LabelSmoothedCrossEntropyCriterionConfig
+):  # base label-smoothing options plus the weight of the alignment term
+    alignment_lambda: float = field(
+        default=0.05, metadata={"help": "weight for the alignment loss"}
+    )
+
+
+@register_criterion(
+    "label_smoothed_cross_entropy_with_alignment",
+    dataclass=LabelSmoothedCrossEntropyCriterionWithAlignmentConfig,
+)
+class LabelSmoothedCrossEntropyCriterionWithAlignment(
+    LabelSmoothedCrossEntropyCriterion
+):  # label-smoothed cross entropy plus a weighted attention-alignment loss
+    def __init__(self, task, sentence_avg, label_smoothing, alignment_lambda):
+        super().__init__(task, sentence_avg, label_smoothing)
+        self.alignment_lambda = alignment_lambda  # weight applied in forward()
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss (plus the weighted alignment loss when one is computed)
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+
+        alignment_loss = None
+
+        # Compute alignment loss only for training set and non dummy batches.
+        if "alignments" in sample and sample["alignments"] is not None:
+            alignment_loss = self.compute_alignment_loss(sample, net_output)
+
+        if alignment_loss is not None:
+            logging_output["alignment_loss"] = utils.item(alignment_loss.data)
+            loss += self.alignment_lambda * alignment_loss  # after "loss" was logged
+
+        return loss, sample_size, logging_output
+
+    def compute_alignment_loss(self, sample, net_output):  # returns a tensor or None
+        attn_prob = net_output[1]["attn"][0]
+        bsz, tgt_sz, src_sz = attn_prob.shape
+        attn = attn_prob.view(bsz * tgt_sz, src_sz)  # one row per (sentence, tgt pos)
+
+        align = sample["alignments"]
+        align_weights = sample["align_weights"].float()
+
+        if len(align) > 0:
+            # Alignment loss computation. align (shape [:, 2]) contains the src-tgt index pairs corresponding to
+            # the alignments. align_weights (shape [:]) contains the 1 / frequency of a tgt index for normalizing.
+            loss = -(
+                (attn[align[:, 1][:, None], align[:, 0][:, None]]).log()
+                * align_weights[:, None]
+            ).sum()
+        else:
+            return None  # no alignment pairs in this batch
+
+        return loss
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        nll_loss_sum = utils.item(
+            sum(log.get("nll_loss", 0) for log in logging_outputs)
+        )
+        alignment_loss_sum = utils.item(
+            sum(log.get("alignment_loss", 0) for log in logging_outputs)
+        )
+        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+
+        metrics.log_scalar(  # dividing by log(2) converts natural-log units to base 2
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_scalar(
+            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
+        )
+        metrics.log_scalar(
+            "alignment_loss",
+            alignment_loss_sum / sample_size / math.log(2),
+            sample_size,
+            round=3,
+        )
+        metrics.log_derived(
+            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+        )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
new file mode 100644
index 0000000000..f2e8cdf3bf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -0,0 +1,97 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import register_criterion
+from fairseq.criterions.label_smoothed_cross_entropy import (
+    LabelSmoothedCrossEntropyCriterion,
+    LabelSmoothedCrossEntropyCriterionConfig,
+)
+from fairseq.data.data_utils import lengths_to_mask
+
+
+@dataclass
+class LabelSmoothedCrossEntropyWithCtcCriterionConfig(
+    LabelSmoothedCrossEntropyCriterionConfig
+):  # base label-smoothing options plus the weight of the CTC term
+    ctc_weight: float = field(default=1.0, metadata={"help": "weight for CTC loss"})
+
+
+@register_criterion(
+    "label_smoothed_cross_entropy_with_ctc",
+    dataclass=LabelSmoothedCrossEntropyWithCtcCriterionConfig,
+)
+class LabelSmoothedCrossEntropyWithCtcCriterion(LabelSmoothedCrossEntropyCriterion):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size,
+        report_accuracy,
+        ctc_weight,
+    ):
+        super().__init__(
+            task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy
+        )
+        self.ctc_weight = ctc_weight  # multiplies the CTC loss; <= 0 skips it
+
+    def forward(self, model, sample, reduce=True):  # -> (loss, sample_size, logs)
+        net_output = model(**sample["net_input"])
+        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)
+
+        ctc_loss = torch.tensor(0.0).type_as(loss)  # stays zero when CTC is skipped
+        if self.ctc_weight > 0.0:
+            ctc_lprobs, ctc_lens = model.get_ctc_output(net_output, sample)
+            ctc_tgt, ctc_tgt_lens = model.get_ctc_target(sample)
+            ctc_tgt_mask = lengths_to_mask(ctc_tgt_lens)
+            ctc_tgt_flat = ctc_tgt.masked_select(ctc_tgt_mask)  # 1-D selected entries
+            reduction = "sum" if reduce else "none"
+            ctc_loss = (
+                F.ctc_loss(
+                    ctc_lprobs,
+                    ctc_tgt_flat,
+                    ctc_lens,
+                    ctc_tgt_lens,
+                    reduction=reduction,
+                    zero_infinity=True,
+                )
+                * self.ctc_weight
+            )
+        loss += ctc_loss
+
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            # NOTE(review): utils.item presumably needs scalars, i.e. reduce=True — confirm
+            "loss": utils.item(loss.data),
+            "nll_loss": utils.item(nll_loss.data),
+            "ctc_loss": utils.item(ctc_loss.data),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, net_output, sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        return loss, sample_size, logging_output
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:
+        super().reduce_metrics(logging_outputs)  # parent metrics first, then ctc_loss
+        loss_sum = sum(log.get("ctc_loss", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(  # dividing by log(2) converts natural-log units to base 2
+            "ctc_loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
new file mode 100644
index 0000000000..47ee263a8d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
@@ -0,0 +1,177 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+
+import torch
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import register_criterion
+from fairseq.criterions.label_smoothed_cross_entropy import (
+    LabelSmoothedCrossEntropyCriterion,
+    LabelSmoothedCrossEntropyCriterionConfig,
+    label_smoothed_nll_loss,
+)
+
+
+@dataclass
+class RdropLabelSmoothedCrossEntropyCriterionConfig(
+    LabelSmoothedCrossEntropyCriterionConfig
+):  # base label-smoothing options plus the weight of the R-Drop KL term
+    rdrop_alpha: float = field(
+        default=0.0,
+        metadata={"help": "alpha for r-drop, 0 means no r-drop"},
+    )
+
+
+@register_criterion(
+    "label_smoothed_cross_entropy_with_rdrop",
+    dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig,
+)
+class RdropLabelSmoothedCrossEntropyCriterion(LabelSmoothedCrossEntropyCriterion):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size=0,
+        report_accuracy=False,
+        rdrop_alpha=0.0,
+    ):
+        super().__init__(
+            task,
+            sentence_avg,
+            label_smoothing,
+            ignore_prefix_size=ignore_prefix_size,
+            report_accuracy=report_accuracy,
+        )
+        self.sentence_avg = sentence_avg
+        self.eps = label_smoothing
+        self.ignore_prefix_size = ignore_prefix_size
+        self.report_accuracy = report_accuracy
+        self.rdrop_alpha = rdrop_alpha  # KL-term weight; values <= 0 disable the term
+
+    def forward(self, model, sample, reduce=True, net_output=None):
+        """Compute the loss for the given sample (or for a precomputed net_output).
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        if net_output is None:
+            if self.rdrop_alpha > 0 and sample["net_input"]["src_tokens"].size(
+                0
+            ) == sample["target"].size(0):
+                sample = duplicate_input(sample)  # doubles the net_input tensors only
+            net_output = model(**sample["net_input"])
+        loss, nll_loss, rdrop_kl_loss = self.compute_loss(
+            model, net_output, sample, reduce=reduce
+        )
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, net_output, sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        if self.rdrop_alpha > 0:
+            logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)
+        return loss, sample_size, logging_output
+
+    def get_lprobs_and_target(self, model, net_output, sample):  # both flattened
+        lprobs = model.get_normalized_probs(net_output, log_probs=True)
+        target = model.get_targets(sample, net_output)
+        if self.rdrop_alpha > 0 or target.size(0) != lprobs.size(0):
+            target = torch.cat([target, target.clone()], dim=0)  # double the targets
+
+        if self.ignore_prefix_size > 0:
+            # lprobs: B x T x C
+            lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
+            target = target[:, self.ignore_prefix_size :].contiguous()
+        return lprobs.view(-1, lprobs.size(-1)), target.view(-1)
+
+    def compute_loss(self, model, net_output, sample, reduce=True):  # -> 3 tensors
+        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
+        loss, nll_loss = label_smoothed_nll_loss(
+            lprobs,
+            target,
+            self.eps,
+            ignore_index=self.padding_idx,
+            reduce=reduce,
+        )
+
+        if self.rdrop_alpha > 0:
+            # first half of the targets only: the second half is its duplicate
+            pad_mask = target[: target.size(0) // 2].unsqueeze(-1).eq(self.padding_idx)
+            rdrop_kl_loss = compute_kl_loss(model, net_output, pad_mask)
+            loss += self.rdrop_alpha * rdrop_kl_loss
+        else:
+            rdrop_kl_loss = loss.new_zeros(1)
+        return loss, nll_loss, rdrop_kl_loss
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        super().reduce_metrics(logging_outputs)
+
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        rdrop_kl_loss = utils.item(
+            sum(log.get("rdrop_kl_loss", 0) for log in logging_outputs)
+            / sample_size
+            / math.log(2)
+        )
+        if rdrop_kl_loss > 0:  # the metric is omitted when the term is zero
+            metrics.log_scalar("rdrop_kl_loss", rdrop_kl_loss)
+
+
+def duplicate_input(sample):
+    if "net_input" in sample.keys():
+        sample_input = sample["net_input"]
+    else:
+        sample_input = sample
+
+    for k, v in sample_input.items():
+        if isinstance(v, torch.Tensor):
+            sample_input[k] = torch.cat([v, v.clone()], dim=0)
+    if "net_input" in sample.keys():
+        sample["net_input"] = sample_input
+    else:
+        sample = sample_input
+    return sample
+
+
+def compute_kl_loss(model, net_output, pad_mask=None, reduce=True):
+    net_prob = model.get_normalized_probs(net_output, log_probs=True)
+    net_prob_tec = model.get_normalized_probs(net_output, log_probs=False)
+
+    net_prob = net_prob.view(-1, net_prob.size(-1))
+    net_prob_tec = net_prob_tec.view(-1, net_prob_tec.size(-1))
+
+    p, q = torch.split(net_prob, net_prob.size(0) // 2, dim=0)
+    p_tec, q_tec = torch.split(net_prob_tec, net_prob_tec.size(0) // 2, dim=0)
+
+    p_loss = torch.nn.functional.kl_div(p, q_tec, reduction="none")
+    q_loss = torch.nn.functional.kl_div(q, p_tec, reduction="none")
+
+    if pad_mask is not None:
+        p_loss.masked_fill_(pad_mask, 0.0)
+        q_loss.masked_fill_(pad_mask, 0.0)
+
+    if reduce:
+        p_loss = p_loss.sum()
+        q_loss = q_loss.sum()
+
+    loss = (p_loss + q_loss) / 2
+    return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
new file mode 100644
index 0000000000..5cf70df2ab
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
@@ -0,0 +1,178 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+def compute_cross_entropy_loss(logits, targets, ignore_index=-100):
+    """
+    Function to compute the cross entropy loss. The default value of
+    ignore_index is the same as the default value for F.cross_entropy in
+    pytorch.
+    """
+    assert logits.size(0) == targets.size(
+        -1
+    ), "Logits and Targets tensor shapes don't match up"
+
+    loss = F.nll_loss(
+        F.log_softmax(logits, -1, dtype=torch.float32),
+        targets,
+        reduction="sum",
+        ignore_index=ignore_index,
+    )
+    return loss
+
+
+@register_criterion("legacy_masked_lm_loss")
+class LegacyMaskedLmLoss(FairseqCriterion):
+    """
+    Implementation for the loss used in masked language model (MLM) training.
+    This optionally also computes the next sentence prediction (NSP) loss and
+    adds it to the overall loss based on the specified args. There are three
+    cases to consider:
+        1) Generic MLM training without NSP loss. In this case sentence_targets
+           and sentence_logits are both None.
+        2) BERT training without NSP loss. In this case sentence_targets is
+           not None but sentence_logits is None and we should not be computing
+           a sentence level loss.
+        3) BERT training with NSP loss. In this case both sentence_targets and
+           sentence_logits are not None and we should be computing a sentence
+           level loss. The weight of the sentence level loss is specified as
+           an argument.
+    """
+
+    def __init__(self, task, masked_lm_only, nsp_loss_weight):
+        super().__init__(task)
+        self.masked_lm_only = masked_lm_only  # True skips the sentence-level loss
+        self.nsp_loss_weight = nsp_loss_weight  # scales the per-sentence NSP loss
+
+    @staticmethod
+    def add_args(parser):
+        """Args for MaskedLM Loss"""
+        # Default for masked_lm_only is False so as to not break BERT training
+        parser.add_argument(
+            "--masked-lm-only",
+            default=False,
+            action="store_true",
+            help="compute MLM loss only",
+        )
+        parser.add_argument(
+            "--nsp-loss-weight",
+            default=1.0,
+            type=float,
+            help="weight for next sentence prediction" " loss (default 1)",
+        )
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        lm_logits, output_metadata = model(**sample["net_input"])
+
+        # reshape lm_logits from (N,T,C) to (N*T,C)
+        lm_logits = lm_logits.view(-1, lm_logits.size(-1))
+        lm_targets = sample["lm_target"].view(-1)
+        lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx)
+
+        # compute the number of tokens for which loss is computed. This is used
+        # to normalize the loss
+        ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel()
+        loss = lm_loss / ntokens  # NOTE(review): presumably ntokens > 0 here — confirm
+        nsentences = sample["nsentences"]
+        # nsentences = 0
+
+        # Compute sentence loss if masked_lm_only is False
+        sentence_loss = None
+        if not self.masked_lm_only:
+            sentence_logits = output_metadata["sentence_logits"]
+            sentence_targets = sample["sentence_target"].view(-1)
+            # This needs to be recomputed due to some differences between
+            # TokenBlock and BlockPair dataset. This can be resolved with a
+            # refactor of BERTModel which we will do in the future.
+            # TODO: Remove this after refactor of BERTModel
+            nsentences = sentence_targets.size(0)
+
+            # Check for logits being none which can happen when remove_heads
+            # is set to true in the BERT model. Ideally we should set
+            # masked_lm_only to true in this case, but that requires some
+            # refactor in the BERT model.
+            if sentence_logits is not None:
+                sentence_loss = compute_cross_entropy_loss(
+                    sentence_logits, sentence_targets
+                )
+
+                loss += self.nsp_loss_weight * (sentence_loss / nsentences)
+
+        # NOTE: as we are summing up per token mlm loss and per sentence nsp loss
+        # we don't need to use sample_size as denominator for the gradient
+        # here sample_size is just used for logging
+        sample_size = 1
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "lm_loss": utils.item(lm_loss.data) if reduce else lm_loss.data,
+            # sentence loss is not always computed
+            "sentence_loss": (
+                (utils.item(sentence_loss.data) if reduce else sentence_loss.data)
+                if sentence_loss is not None
+                else 0.0
+            ),
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        lm_loss_sum = sum(log.get("lm_loss", 0) for log in logging_outputs)
+        sentence_loss_sum = sum(log.get("sentence_loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        agg_loss = sum(log.get("loss", 0) for log in logging_outputs)
+
+        metrics.log_scalar(  # each metric falls back to 0.0 when its denominator is 0
+            "loss",
+            agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.0,
+            sample_size,
+            round=3,
+        )
+        metrics.log_scalar(
+            "lm_loss",
+            lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0,
+            ntokens,
+            round=3,
+        )
+        metrics.log_scalar(
+            "sentence_loss",
+            sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.0,
+            nsentences,
+            round=3,
+        )
+        metrics.log_scalar(  # same value as "lm_loss", logged under a second name
+            "nll_loss",
+            lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0,
+            ntokens,
+            round=3,
+        )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
new file mode 100644
index 0000000000..09ddd9f3e6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+import math
+from omegaconf import II
+
+import torch
+from fairseq import modules, utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class MaskedLmConfig(FairseqDataclass):
+    tpu: bool = II("common.tpu")  # interpolated from the "common.tpu" config entry
+
+
+@register_criterion("masked_lm", dataclass=MaskedLmConfig)
+class MaskedLmLoss(FairseqCriterion):
+    """
+    Implementation for the loss used in masked language model (MLM) training.
+    """
+
+    def __init__(self, cfg: MaskedLmConfig, task):
+        super().__init__(task)
+        self.tpu = cfg.tpu  # when set, forward() always projects all tokens
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        masked_tokens = sample["target"].ne(self.padding_idx)
+        sample_size = masked_tokens.int().sum()  # count of non-pad targets (a tensor)
+
+        # Rare: when no target position is selected (all targets are padding),
+        # project all tokens. We use torch.where to avoid device-to-host
+        # transfers, except on CPU where torch.where is not well supported
+        # (see github.com/pytorch/pytorch/issues/26247).
+        if self.tpu:
+            masked_tokens = None  # always project all tokens on TPU
+        elif masked_tokens.device == torch.device("cpu"):
+            if not masked_tokens.any():
+                masked_tokens = None
+        else:
+            masked_tokens = torch.where(
+                masked_tokens.any(),
+                masked_tokens,
+                masked_tokens.new([True]),
+            )
+
+        logits = model(**sample["net_input"], masked_tokens=masked_tokens)[0]
+        targets = model.get_targets(sample, [logits])
+        if masked_tokens is not None:
+            targets = targets[masked_tokens]  # keep only the selected positions
+
+        loss = modules.cross_entropy(
+            logits.view(-1, logits.size(-1)),
+            targets.view(-1),
+            reduction="sum",
+            ignore_index=self.padding_idx,
+        )
+
+        logging_output = {
+            "loss": loss if self.tpu else loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(  # dividing by log(2) converts natural-log units to base 2
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_derived(
+            "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+        )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True improves distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
new file mode 100644
index 0000000000..4c020ddbd2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
@@ -0,0 +1,177 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+import torch
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.logging.meters import safe_round
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelCriterionConfig(FairseqDataclass):
+    loss_weights: Dict[str, float] = field(
+        default_factory=dict,  # empty means every loss term is scaled by 1.0
+        metadata={"help": "weights for the loss terms"},
+    )
+    log_keys: List[str] = field(
+        default_factory=list,
+        metadata={"help": "additional output keys to log"},
+    )
+    can_sum: bool = True  # presumably forwarded to ModelCriterion(can_sum=...) — confirm
+
+
+@register_criterion("model", dataclass=ModelCriterionConfig)
+class ModelCriterion(FairseqCriterion):
+    """
+    This criterion relies on the model to supply losses.
+    The losses should be a dictionary of name -> scalar returned by
+    the model either by including it in the net_output dict or by
+    implementing a get_losses(net_output, sample) method. The final loss is
+    a scaled sum of all losses according to weights in loss_weights.
+    If no weights are provided, then all losses are scaled by 1.0.
+
+    The losses will be automatically logged. Additional keys from
+    net_output dict can be logged via the log_keys parameter.
+    """
+
+    def __init__(self, task, loss_weights=None, log_keys=None, can_sum=True):
+        super().__init__(task)
+        self.loss_weights = loss_weights
+        self.log_keys = log_keys
+        self.can_sum = can_sum
+
+    def forward(self, model, sample, reduce=True):
+        net_output = model(**sample["net_input"])
+
+        scaled_losses = {}
+
+        if hasattr(model, "get_losses"):
+            losses = model.get_losses(net_output, sample)
+        elif isinstance(net_output, dict) and "losses" in net_output:
+            losses = net_output["losses"]
+        else:
+            raise Exception("Could not retrieve losses")
+
+        for lk, p in losses.items():
+            try:
+                coef = 1.0 if len(self.loss_weights) == 0 else self.loss_weights[lk]
+            except KeyError:
+                logger.error(
+                    f"weight for loss {lk} is not in loss_weights ({self.loss_weights})"
+                )
+                raise
+            if coef != 0 and p is not None:
+                scaled_losses[lk] = coef * p.float().sum()
+
+        loss = sum(scaled_losses.values())
+
+        if "sample_size" in net_output:
+            sample_size = net_output["sample_size"]
+        else:
+            sample_size = loss.numel()
+
+        if reduce and loss.numel() > 1:
+            loss = loss.sum()
+
+        logging_output = {
+            "loss": loss.data,
+            "ntokens": sample_size,
+            "nsentences": sample["id"].numel(),
+            "sample_size": sample_size,
+            "_world_size": 1,
+        }
+
+        for lk in self.log_keys:
+            if lk in net_output and net_output[lk] is not None:
+                if not torch.is_tensor(net_output[lk]) or net_output[lk].numel() == 1:
+                    logging_output[lk] = float(net_output[lk])
+                elif lk.startswith("_"):
+                    logging_output[lk] = net_output[lk]
+                else:
+                    for i, v in enumerate(net_output[lk]):
+                        logging_output[f"{lk}_{i}"] = float(v)
+
+        if len(scaled_losses) > 1:
+            for lk, l in scaled_losses.items():
+                if l.numel() > 1:
+                    l = l.sum()
+                logging_output[f"loss_{lk}"] = l.item()
+
+        if "logs" in net_output:
+            for lgw in net_output["logs"]:
+                logging_output[lgw] = net_output["logs"][lgw]
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+        nsentences = utils.item(
+            sum(log.get("nsentences", 0) for log in logging_outputs)
+        )
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+
+        metrics.log_scalar("loss", loss_sum / sample_size, sample_size, round=3)
+        metrics.log_scalar("ntokens", ntokens)
+        metrics.log_scalar("nsentences", nsentences)
+        metrics.log_scalar("sample_size", sample_size)
+
+        builtin_keys = {
+            "loss",
+            "ntokens",
+            "nsentences",
+            "sample_size",
+            "_world_size",
+        }
+
+        world_size = utils.item(
+            sum(log.get("_world_size", 0) for log in logging_outputs)
+        )
+
+        for k in logging_outputs[0]:
+            if k not in builtin_keys and not k.startswith("_"):
+                val = sum(log.get(k, 0) for log in logging_outputs)
+                if k.startswith("loss_"):
+                    metrics.log_scalar(k, val / sample_size, sample_size, round=3)
+                else:
+                    metrics.log_scalar(k, val / world_size, round=3)
+
+        correct = sum(log.get("correct", 0) for log in logging_outputs)
+        total = sum(log.get("count", 0) for log in logging_outputs)
+
+        if total > 0:
+            metrics.log_scalar("_correct", correct)
+            metrics.log_scalar("_total", total)
+
+            metrics.log_derived(
+                "accuracy",
+                lambda meters: safe_round(
+                    meters["_correct"].sum / meters["_total"].sum, 5
+                )
+                if meters["_total"].sum > 0
+                else float("nan"),
+            )
+
+    def logging_outputs_can_be_summed(self) -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improves distributed training speed.
+        """
+        return self.can_sum
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
new file mode 100644
index 0000000000..fc0bdaf851
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
@@ -0,0 +1,181 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from torch import Tensor
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class LabelSmoothedDualImitationCriterionConfig(FairseqDataclass):  # options for the "nat_loss" criterion
+    label_smoothing: float = field(
+        default=0.0,  # 0 disables label smoothing (see help text below)
+        metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
+    )
+
+
+@register_criterion("nat_loss", dataclass=LabelSmoothedDualImitationCriterionConfig)
+class LabelSmoothedDualImitationCriterion(FairseqCriterion):
+    def __init__(self, task, label_smoothing):
+        super().__init__(task)
+        self.label_smoothing = label_smoothing  # not read elsewhere in this class; forward() uses each output's "ls" entry
+
+    def _compute_loss(
+        self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0
+    ):
+        """Compute the (optionally label-smoothed) loss over the masked positions.
+        outputs: batch x len x d_model
+        targets: batch x len
+        masks:   batch x len
+
+        Returns a dict with "name", "loss" (already multiplied by `factor`),
+        "nll_loss" (not multiplied by `factor`, no label smoothing) and "factor".
+        """
+
+        def mean_ds(x: Tensor, dim=None) -> Tensor:
+            return (
+                x.float().mean().type_as(x)  # average in float32, then cast back to x's dtype
+                if dim is None
+                else x.float().mean(dim).type_as(x)
+            )
+
+        if masks is not None:
+            outputs, targets = outputs[masks], targets[masks]  # keep only the masked positions
+
+        if masks is not None and not masks.any():  # mask selects nothing: return a zero loss
+            nll_loss = torch.tensor(0)
+            loss = nll_loss
+        else:
+            logits = F.log_softmax(outputs, dim=-1)  # despite the name, these are log-probabilities
+            if targets.dim() == 1:  # 1-D targets are passed to nll_loss as hard labels
+                losses = F.nll_loss(logits, targets.to(logits.device), reduction="none")
+
+            else:  # soft-labels
+                losses = F.kl_div(logits, targets.to(logits.device), reduction="none")
+                losses = losses.sum(-1)
+
+            nll_loss = mean_ds(losses)
+            if label_smoothing > 0:
+                loss = (
+                    nll_loss * (1 - label_smoothing) - mean_ds(logits) * label_smoothing
+                )
+            else:
+                loss = nll_loss
+
+        loss = loss * factor  # only "loss" is scaled; "nll_loss" is returned unscaled
+        return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor}
+
+    def _custom_loss(self, loss, name="loss", factor=1.0):
+        return {"name": name, "loss": loss, "factor": factor}  # loss is passed through unscaled; no "nll_loss" key
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size (always 1 here; it is only used for logging)
+        3) logging outputs to display while training
+        """
+        nsentences, ntokens = sample["nsentences"], sample["ntokens"]
+
+        # B x T
+        src_tokens, src_lengths = (
+            sample["net_input"]["src_tokens"],
+            sample["net_input"]["src_lengths"],
+        )
+        tgt_tokens, prev_output_tokens = sample["target"], sample["prev_target"]
+
+        outputs = model(src_tokens, src_lengths, prev_output_tokens, tgt_tokens)
+        losses, nll_loss = [], []
+
+        for obj in outputs:  # one entry per named model output
+            if outputs[obj].get("loss", None) is None:  # no precomputed loss: derive it from out/tgt/mask
+                _losses = self._compute_loss(
+                    outputs[obj].get("out"),
+                    outputs[obj].get("tgt"),
+                    outputs[obj].get("mask", None),
+                    outputs[obj].get("ls", 0.0),
+                    name=obj + "-loss",
+                    factor=outputs[obj].get("factor", 1.0),
+                )
+            else:
+                _losses = self._custom_loss(
+                    outputs[obj].get("loss"),
+                    name=obj + "-loss",
+                    factor=outputs[obj].get("factor", 1.0),
+                )
+
+            losses += [_losses]
+            if outputs[obj].get("nll_loss", False):  # the output opts in to the nll_loss total
+                nll_loss += [_losses.get("nll_loss", 0.0)]
+
+        loss = sum(l["loss"] for l in losses)
+        nll_loss = sum(l for l in nll_loss) if len(nll_loss) > 0 else loss.new_tensor(0)
+
+        # NOTE:
+        # we don't need to use sample_size as denominator for the gradient
+        # here sample_size is just used for logging
+        sample_size = 1
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+
+        for l in losses:
+            logging_output[l["name"]] = (  # log each loss with its factor divided back out
+                utils.item(l["loss"].data / l["factor"])
+                if reduce
+                else l["loss"].data / l["factor"]  # was l[["loss"]]: a list is not a valid dict key (TypeError)
+            )
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+        loss = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        nll_loss = utils.item(sum(log.get("nll_loss", 0) for log in logging_outputs))
+
+        metrics.log_scalar(  # NOTE(review): no sample_size > 0 guard here, unlike the per-key loop below
+            "loss", loss / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_scalar(
+            "nll_loss", nll_loss / sample_size / math.log(2), sample_size, round=3
+        )
+        metrics.log_derived(
+            "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+        )
+
+        for key in logging_outputs[0]:  # keys are taken from the first worker's output only
+            if key[-5:] == "-loss":  # per-output entries that forward() logs as "<name>-loss"
+                val = sum(log.get(key, 0) for log in logging_outputs)
+                metrics.log_scalar(
+                    key[:-5],  # metric name without the "-loss" suffix
+                    val / sample_size / math.log(2) if sample_size > 0 else 0.0,
+                    sample_size,
+                    round=3,
+                )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
new file mode 100644
index 0000000000..298b805768
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
@@ -0,0 +1,288 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from itertools import chain
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from sklearn.metrics import f1_score
+from sklearn.metrics import matthews_corrcoef as _matthews_corrcoef
+from scipy.stats import pearsonr, spearmanr
+
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.logging.meters import safe_round
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()  # mean of the elementwise match; callers in this file pass numpy arrays
+
+
+def acc_and_f1(preds, labels):
+    acc = simple_accuracy(preds, labels)
+    f1 = f1_score(y_true=labels, y_pred=preds)
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,  # unweighted mean of accuracy and F1
+    }
+
+
+def pearson_and_spearman(preds, labels):
+    pearson_corr = pearsonr(preds, labels)[0]  # [0] is the correlation coefficient
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,  # unweighted mean of the two coefficients
+    }
+
+
+def matthews_corrcoef(preds, labels):
+    # wrapper that takes (preds, labels) like the other metrics here; the sklearn call gets (labels, preds)
+    mcc = _matthews_corrcoef(labels, preds)
+    return mcc
+
+
+@dataclass
+class SentencePredictionConfig(FairseqDataclass):  # options for the "sentence_prediction" criterion
+    classification_head_name: str = field(
+        default="sentence_classification_head",
+        metadata={"help": "name of the classification head to use"},
+    )
+    regression_target: bool = field(
+        default=False,  # True switches the criterion from NLL (classification) to MSE (regression)
+    )
+    report_mcc: bool = False  # report Matthews correlation ("mcc") outside training mode
+    report_acc_and_f1: bool = False  # report "acc_and_f1" and "f1" outside training mode
+    report_pearson_and_spearman: bool = False  # report Pearson/Spearman correlations outside training mode
+
+
+@register_criterion("sentence_prediction", dataclass=SentencePredictionConfig)
+class SentencePredictionCriterion(FairseqCriterion):
+    def __init__(self, cfg: SentencePredictionConfig, task):
+        super().__init__(task)
+        self.classification_head_name = cfg.classification_head_name
+        self.regression_target = cfg.regression_target
+        self.keep_pred_and_targ = (  # predictions/targets are only kept when some report_* option needs them
+            cfg.report_mcc or cfg.report_acc_and_f1 or cfg.report_pearson_and_spearman
+        )
+        self.report_mcc = cfg.report_mcc
+        self.report_acc_and_f1 = cfg.report_acc_and_f1
+        self.report_pearson_and_spearman = cfg.report_pearson_and_spearman
+        self.label_dict = task.label_dictionary  # used in forward() to map class indices back to label strings
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        assert (
+            hasattr(model, "classification_heads")
+            and self.classification_head_name in model.classification_heads
+        ), "model must provide sentence classification head for --criterion=sentence_prediction"
+
+        logits, _ = model(
+            **sample["net_input"],
+            features_only=True,
+            classification_head_name=self.classification_head_name,
+        )
+        targets = model.get_targets(sample, [logits]).view(-1)  # flattened to 1-D
+        sample_size = targets.numel()  # number of target elements
+
+        if not self.regression_target:
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+            task_loss = F.nll_loss(lprobs, targets, reduction="sum")  # classification: summed NLL
+        else:
+            logits = logits.view(-1).float()
+            targets = targets.float()
+            task_loss = F.mse_loss(logits, targets, reduction="sum")  # regression: summed squared error
+
+        logging_output = {}
+        loss = task_loss  # NOTE(review): same tensor object as task_loss; the += below presumably update it in place
+        # mha & ffn regularization update
+        if (
+            hasattr(model, "args")
+            and hasattr(model.args, "mha_reg_scale_factor")
+            and model.args.mha_reg_scale_factor != 0.0
+        ):
+            mha_reg_loss = model._get_adaptive_head_loss()
+            loss += mha_reg_loss
+            logging_output.update({"mha_reg_loss": mha_reg_loss})
+        if (
+            hasattr(model, "args")
+            and hasattr(model.args, "ffn_reg_scale_factor")
+            and model.args.ffn_reg_scale_factor != 0.0
+        ):
+            ffn_reg_loss = model._get_adaptive_ffn_loss()
+            loss += ffn_reg_loss
+            logging_output.update({"ffn_reg_loss": ffn_reg_loss})
+
+        logging_output.update(
+            {
+                "loss": loss.data,
+                "ntokens": sample["ntokens"],
+                "nsentences": sample_size,  # sentence count is taken to be the number of targets
+                "sample_size": sample_size,
+            }
+        )
+        if not self.regression_target:
+            preds = logits.argmax(dim=1)
+            logging_output["ncorrect"] = (preds == targets).sum()
+        if self.keep_pred_and_targ and not model.training:  # collected only outside training mode
+            if self.regression_target:
+                logging_output["pred"] = logits.detach().cpu().tolist()
+                logging_output["targ"] = targets.detach().cpu().tolist()
+            else:
+                # remove offset `self.label_dict.nspecial` from OffsetTokensDataset
+                preds = self.label_dict.string(preds + self.label_dict.nspecial).split()
+                targets = self.label_dict.string(
+                    targets + self.label_dict.nspecial
+                ).split()
+                logging_output["pred"] = list(map(int, preds))  # label strings are parsed as integers
+                logging_output["targ"] = list(map(int, targets))
+
+            if self.report_mcc:  # flags tell reduce_metrics which derived metrics to register
+                logging_output["report_mcc"] = True
+            if self.report_acc_and_f1:
+                logging_output["report_acc_and_f1"] = True
+            if self.report_pearson_and_spearman:
+                logging_output["report_pearson_and_spearman"] = True
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        mha_reg_loss_sum = sum(log.get("mha_reg_loss", 0) for log in logging_outputs)
+        ffn_reg_loss_sum = sum(log.get("ffn_reg_loss", 0) for log in logging_outputs)
+
+        metrics.log_scalar(  # NOTE(review): sample_size is not checked for zero before dividing
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if mha_reg_loss_sum:  # skipped when the key is absent everywhere or the sum is zero
+            metrics.log_scalar(
+                "mha_reg_loss",
+                mha_reg_loss_sum / sample_size / math.log(2),
+                sample_size,
+                round=3,
+            )
+        if ffn_reg_loss_sum:
+            metrics.log_scalar(
+                "ffn_reg_loss",
+                ffn_reg_loss_sum / sample_size / math.log(2),
+                sample_size,
+                round=3,
+            )
+        if sample_size != ntokens:  # also report the loss normalized per token when the two counts differ
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+
+        if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
+            ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
+            metrics.log_scalar(
+                "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1
+            )
+
+        # Metrics used by GLUE
+        pred = np.array(  # flatten the per-worker "pred" lists into one array
+            list(chain.from_iterable(log.get("pred", []) for log in logging_outputs))
+        )
+        targ = np.array(
+            list(chain.from_iterable(log.get("targ", []) for log in logging_outputs))
+        )
+        if len(pred):  # only when forward() kept predictions (see its report_* flags)
+            metrics.log_concat_tensor("pred", torch.from_numpy(pred), dim=0)
+            metrics.log_concat_tensor("targ", torch.from_numpy(targ), dim=0)
+            if any("report_mcc" in log for log in logging_outputs):
+                metrics.log_derived(
+                    "mcc",
+                    lambda meters: safe_round(
+                        matthews_corrcoef(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )
+                        * 100,  # scaled by 100 before rounding to one decimal
+                        1,
+                    ),
+                )
+            if any("report_acc_and_f1" in log for log in logging_outputs):
+                metrics.log_derived(
+                    "acc_and_f1",
+                    lambda meters: safe_round(
+                        acc_and_f1(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )["acc_and_f1"]
+                        * 100,
+                        1,
+                    ),
+                )
+                metrics.log_derived(
+                    "f1",
+                    lambda meters: safe_round(
+                        acc_and_f1(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )["f1"]
+                        * 100,
+                        1,
+                    ),
+                )
+            if any("report_pearson_and_spearman" in log for log in logging_outputs):
+                metrics.log_derived(
+                    "pearson_and_spearman",
+                    lambda meters: safe_round(
+                        pearson_and_spearman(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )["corr"]
+                        * 100,
+                        1,
+                    ),
+                )
+                metrics.log_derived(
+                    "pearson",
+                    lambda meters: safe_round(
+                        pearson_and_spearman(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )["pearson"]
+                        * 100,
+                        1,
+                    ),
+                )
+                metrics.log_derived(
+                    "spearman",
+                    lambda meters: safe_round(
+                        pearson_and_spearman(
+                            meters["pred"].tensor.numpy(),
+                            meters["targ"].tensor.numpy(),
+                        )["spearmanr"]  # helper's key is "spearmanr"; the logged metric is named "spearman"
+                        * 100,
+                        1,
+                    ),
+                )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
new file mode 100644
index 0000000000..8a873a45b3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from fairseq.criterions import register_criterion
+from fairseq.criterions.sentence_prediction import (
+    SentencePredictionCriterion,
+    SentencePredictionConfig,
+)
+
+
+@register_criterion("sentence_prediction_adapters", dataclass=SentencePredictionConfig)
+class SentencePredictionCriterionAdapters(SentencePredictionCriterion):
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        assert (
+            hasattr(model, "classification_heads")
+            and self.classification_head_name in model.classification_heads
+        ), "model must provide sentence classification head for --criterion=sentence_prediction"
+
+        if "lang_id" not in sample:  # sample is indexed like a dict; hasattr() never sees its keys
+            # If no language ID is given, we fall back to English
+            lang_id = ["en_XX"] * sample["nsentences"]
+        else:
+            lang_id = sample["lang_id"]
+
+        logits, _ = model(
+            **sample["net_input"],
+            features_only=True,
+            classification_head_name=self.classification_head_name,
+            lang_id=lang_id,
+        )
+        targets = model.get_targets(sample, [logits]).view(-1)  # flattened to 1-D
+        sample_size = targets.numel()
+
+        if not self.regression_target:
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+            loss = F.nll_loss(lprobs, targets, reduction="sum")  # classification: summed NLL
+        else:
+            logits = logits.view(-1).float()
+            targets = targets.float()
+            loss = F.mse_loss(logits, targets, reduction="sum")  # regression: summed squared error
+
+        logging_output = {
+            "loss": loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample_size,
+            "sample_size": sample_size,
+        }
+        if not self.regression_target:
+            preds = logits.argmax(dim=1)
+            logging_output["ncorrect"] = (preds == targets).sum()
+
+        return loss, sample_size, logging_output
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
new file mode 100644
index 0000000000..bfb9f058f9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion("sentence_ranking")
+class SentenceRankingCriterion(FairseqCriterion):
+    def __init__(self, task, ranking_head_name, save_predictions, num_classes):
+        super().__init__(task)
+        self.ranking_head_name = ranking_head_name
+        if save_predictions is not None:
+            self.prediction_h = open(save_predictions, "w")  # stays open while the criterion lives; closed in __del__
+        else:
+            self.prediction_h = None
+        self.num_classes = num_classes  # forward() scores this many inputs per sample
+
+    def __del__(self):
+        if getattr(self, "prediction_h", None) is not None:  # attribute is missing if __init__ raised before setting it
+            self.prediction_h.close()
+
+    @staticmethod
+    def add_args(parser):  # registers this criterion's command-line options; num_classes is not defined here
+        # fmt: off
+        parser.add_argument('--save-predictions', metavar='FILE',
+                            help='file to save predictions to')
+        parser.add_argument('--ranking-head-name',
+                            default='sentence_classification_head',
+                            help='name of the ranking head to use')
+        # fmt: on
+
+    def forward(self, model, sample, reduce=True):
+        """Compute ranking loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        assert (
+            hasattr(model, "classification_heads")
+            and self.ranking_head_name in model.classification_heads
+        ), "model must provide sentence ranking head for --criterion=sentence_ranking"
+
+        scores = []
+        for idx in range(self.num_classes):
+            score, _ = model(
+                **sample["net_input{idx}".format(idx=idx + 1)],  # inputs are keyed "net_input1", "net_input2", ...
+                classification_head_name=self.ranking_head_name,
+            )
+            scores.append(score)
+
+        logits = torch.cat(scores, dim=1)  # presumably one column per candidate -- TODO confirm score shape
+        sample_size = logits.size(0)
+
+        if "target" in sample:
+            targets = model.get_targets(sample, [logits]).view(-1)
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+            loss = F.nll_loss(lprobs, targets, reduction="sum")
+        else:
+            targets = None
+            loss = torch.tensor(0.0, requires_grad=True)  # placeholder loss when the sample has no targets
+
+        if self.prediction_h is not None:  # optionally write one "id<TAB>pred[<TAB>label]" line per example
+            preds = logits.argmax(dim=1)
+            for i, (id, pred) in enumerate(zip(sample["id"].tolist(), preds.tolist())):  # NOTE: `id` shadows the builtin
+                if targets is not None:
+                    label = targets[i].item()
+                    print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h)
+                else:
+                    print("{}\t{}".format(id, pred), file=self.prediction_h)
+
+        logging_output = {
+            "loss": loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample_size,
+            "sample_size": sample_size,
+        }
+        if targets is not None:
+            logging_output["ncorrect"] = (logits.argmax(dim=1) == targets).sum()
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(  # NOTE(review): sample_size is not checked for zero before dividing
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if sample_size != ntokens:  # also report the loss normalized per token when the two counts differ
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+
+        if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:  # forward() sets "ncorrect" only when targets exist
+            ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
+            metrics.log_scalar(
+                "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
new file mode 100644
index 0000000000..8888180114
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
@@ -0,0 +1,335 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import II
+
+
+@dataclass
+class SpeechDLMCriterionConfig(FairseqDataclass):  # options for the criterion registered as "speech_dlm_criterion"
+    sentence_avg: bool = II("optimization.sentence_avg")  # omegaconf interpolation of optimization.sentence_avg
+    main_and_cross_weights: Optional[str] = field(  # split on "," by SpeechDLMCriterion.__init__ into two floats
+        default="1,0",
+        metadata={
+            "help": "Comma-separated list of weights of Main-channel vs Cross-channel Prediction Losses"
+            "(default: 1,0)"  # NOTE(review): adjacent literals join without a space ("...Losses(default: 1,0)")
+        },
+    )
+    general_unit_loss_weight: float = field(  # weight applied to the "next" target loss
+        default=0,
+        metadata={
+            "help": "The weight of the General Prediction Loss (Next-step Unit Prediction Loss)"
+            "(default: 0)"
+        },
+    )
+    edge_unit_loss_weight: float = field(  # weight applied to the "edge" target loss
+        default=1,
+        metadata={"help": "The weight of the Edge Unit Prediction Loss" "(default: 1)"},
+    )
+    duration_loss_weight: float = field(  # weight applied to the "duration" target loss
+        default=1,
+        metadata={
+            "help": "The weight of the Edge Unit Duration Prediction Loss"
+            "(default: 1)"
+        },
+    )
+
+
+@register_criterion("speech_dlm_criterion", dataclass=SpeechDLMCriterionConfig)
+class SpeechDLMCriterion(FairseqCriterion):
+    """Criterion for the SpeechDLM model as described in the paper:
+    https://arxiv.org/pdf/2203.16502.pdf
+
+    There are 3 possible losses depending on the targets of the model:
+        - general_unit_loss : The next unit prediction loss, corresponding to
+            'next' target
+        - edge_unit_loss : The edge unit prediction loss, corresponding to
+            'edge' target
+        - duration_loss : The duration prediction loss, corresponding to
+            'duration' target
+    """
+
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        main_and_cross_weights,
+        general_unit_loss_weight,
+        edge_unit_loss_weight,
+        duration_loss_weight,
+    ):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+
+        self.channels = task.channels
+        self.targets = task.targets
+        self.delayed_duration_target = task.delayed_duration_target
+
+        self.main_channel_weight = float(main_and_cross_weights.split(",")[0])  # first field of the "main,cross" string
+        self.cross_channel_weight = float(main_and_cross_weights.split(",")[1])  # second field of the "main,cross" string
+        assert self.main_channel_weight >= 0 and self.cross_channel_weight >= 0
+
+        self.channel_weights = {  # task.channel_weights paired positionally with task.channels
+            channel: weight
+            for channel, weight in zip(self.channels, task.channel_weights)
+        }
+
+        self.target_weights = {}
+        for t in self.targets:  # a target other than "next"/"edge"/"duration" gets no weight entry
+            if t == "next":
+                self.target_weights[t] = general_unit_loss_weight
+                assert (
+                    general_unit_loss_weight > 0
+                ), "Expect a positive --general-unit-loss-weight for next unit prediction"
+            elif t == "edge":
+                self.target_weights[t] = edge_unit_loss_weight
+                assert (
+                    edge_unit_loss_weight > 0
+                ), "Expect a positive --edge-unit-loss-weight for edge unit prediction"
+            elif t == "duration":
+                self.target_weights[t] = duration_loss_weight
+                assert (
+                    duration_loss_weight > 0
+                ), "Expect a positive --duration-loss-weight for duration prediction"
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        loss_dict, stats_dict = self.compute_loss(
+            model, net_output, sample, reduce=reduce
+        )
+        nsentences = sample["net_input"]["src_tokens"][self.channels[0]].size(0)  # dim 0 of the first channel's src_tokens
+
+        logging_output = {
+            "nsentences": nsentences,
+        }
+        logging_output["nsentences"] = nsentences  # NOTE(review): redundant, the key is already set in the literal above
+
+        loss_all = {t: 0 for t in self.targets}
+        correct_all = {t: 0 for t in self.targets}
+        count_all = {t: 0 for t in self.targets}
+        ntokens_all = 0
+        sample_size_all = 0
+        for channel in loss_dict:
+            for pred_channel in loss_dict[channel]:
+                # Get ntokens & sample_size
+                ntokens = sample["net_input"]["src_tokens"][channel].numel()  # numel() counts every element of the tensor
+                sample_size = nsentences if self.sentence_avg else ntokens
+                prefix = "[{}-{}]".format(channel, pred_channel)  # per-pair key prefix, parsed back in reduce_metrics
+                log_keys = {
+                    "next": "general_token",
+                    "edge": "edge_token",
+                    "duration": "edge_duration",
+                }
+
+                # Log & Update the sizes
+                logging_output["{}ntokens".format(prefix)] = ntokens
+                logging_output["{}sample_size".format(prefix)] = sample_size
+                ntokens_all += ntokens
+                sample_size_all += sample_size
+
+                for t in self.targets:
+                    log_key = log_keys[t]
+                    loss = loss_dict[channel][pred_channel][t]
+                    correct, count = stats_dict[channel][pred_channel][t]
+
+                    # Log the statistics
+                    logging_output["{}{}_loss".format(prefix, log_key)] = loss.data  # unweighted loss is what gets logged
+                    logging_output["{}{}_correct".format(prefix, log_key)] = correct
+                    logging_output["{}{}_count".format(prefix, log_key)] = count
+
+                    # Scale the training loss by weights
+                    target_loss = loss * self.channel_weights[channel]
+                    if pred_channel == channel:
+                        target_loss = target_loss * self.main_channel_weight
+                    else:
+                        target_loss = target_loss * self.cross_channel_weight
+                    # Normalize the losses in the training by the number of edges
+                    if t in ["edge", "duration"]:
+                        target_loss = target_loss / count * sample_size  # NOTE(review): no guard for count == 0 — confirm it cannot happen
+
+                    # Update the statistics
+                    loss_all[t] += target_loss
+                    correct_all[t] += correct
+                    count_all[t] += count
+
+        # Logging the average statistics
+        logging_output["ntokens"] = ntokens_all
+        logging_output["sample_size"] = sample_size_all
+        for t in self.targets:
+            log_key = {
+                "next": "general_token",
+                "edge": "edge_token",
+                "duration": "edge_duration",
+            }[t]
+            logging_output["{}_loss".format(log_key)] = loss_all[t].data  # NOTE(review): loss_all[t] is still int 0 if loss_dict was empty
+            logging_output["{}_correct".format(log_key)] = correct_all[t]
+            logging_output["{}_count".format(log_key)] = count_all[t]
+
+        # Define the training loss
+        training_loss = 0
+        for t in self.targets:
+            training_loss += loss_all[t] * self.target_weights[t]
+        logging_output["loss"] = training_loss.data
+
+        return training_loss, sample_size_all, logging_output
+
+    def compute_loss(self, model, net_output, sample, reduce=True):  # returns (loss_dict, stats_dict), both keyed [channel][pred_channel][target]
+        # Get the model outputs and target
+        lprobs_dict = model.get_normalized_probs(net_output, log_probs=True)
+        target_dict = model.get_targets(sample, net_output)
+
+        # Init the dictionaries
+        loss_dict, stats_dict = {}, {}
+
+        for channel in lprobs_dict:
+            # Init the dictionaries
+            loss_dict[channel], stats_dict[channel] = {}, {}
+
+            for pred_channel in lprobs_dict[channel]:
+                # Init the dictionaries
+                loss_dict[channel][pred_channel] = {}
+                stats_dict[channel][pred_channel] = {}
+
+                # Get token & duration predictions
+                outputs = lprobs_dict[channel][pred_channel]
+                if not isinstance(outputs, dict):
+                    token_lprobs = outputs  # NOTE(review): dur_preds stays unbound on this path; a "duration" target needs dict outputs
+                else:
+                    token_lprobs = outputs["pred_token"]
+                    dur_preds = outputs["pred_duration"]
+                    dur_preds = dur_preds.view(-1)
+                token_lprobs = token_lprobs.view(-1, token_lprobs.size(-1))  # flatten all leading dims, keep the last dim
+                token_preds = token_lprobs.argmax(dim=-1)
+
+                # Get edge indices
+                if "edge" in self.targets or "duration" in self.targets:
+                    edge_indices = target_dict["edge_indices"][pred_channel]  # used below to index the flattened predictions
+
+                # Compute loss and statistics
+                for t in self.targets:
+                    if t in ["next", "edge"]:
+                        if t == "next":
+                            target = target_dict["next"][pred_channel].view(-1)
+                            lprobs = token_lprobs
+                            preds = token_preds
+                        elif t == "edge":
+                            target = target_dict["edge"][pred_channel]
+                            lprobs = token_lprobs[edge_indices]
+                            preds = token_preds[edge_indices]
+
+                        loss = F.nll_loss(
+                            lprobs,
+                            target,
+                            ignore_index=self.padding_idx,
+                            reduction="sum" if reduce else "none",
+                        )
+                    elif t == "duration":
+                        target = target_dict["duration"][pred_channel]
+                        if self.delayed_duration_target:
+                            duration_indices = edge_indices + 1  # delayed target: read the prediction one position after each edge
+                            if duration_indices[-1] == len(dur_preds):  # last shifted index is out of range: drop it and its target
+                                duration_indices = duration_indices[:-1]
+                                target = target[:-1]
+                        else:
+                            duration_indices = edge_indices
+                        preds = dur_preds[duration_indices]
+
+                        loss = F.l1_loss(
+                            preds,
+                            target,
+                            reduction="sum" if reduce else "none",
+                        )
+                        preds = preds.round()  # rounded only for the exact-match accuracy statistic below
+
+                    correct = (preds == target).sum().float().cpu().item()  # number of exact matches as a Python float
+                    count = float(target.size(0))
+
+                    loss_dict[channel][pred_channel][t] = loss
+                    stats_dict[channel][pred_channel][t] = (correct, count)
+
+        return loss_dict, stats_dict
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        logging_keys = next(iter(logging_outputs)).keys()  # NOTE(review): raises StopIteration if logging_outputs is empty
+        channels = [item[:-7] for item in logging_keys if item.endswith("ntokens")]  # strip "ntokens"; "" stands for the aggregate keys
+        target_prefixes = set(
+            [
+                item[:-5].split("]")[-1]  # strip "_loss", then drop any "[channel-pred_channel]" prefix
+                for item in logging_keys
+                if item.endswith("_loss")
+            ]
+        )
+        for channel_prefix in channels:
+            for target_prefix in target_prefixes:
+                prefix = "{}{}".format(channel_prefix, target_prefix)
+                count_sum = sum(
+                    log.get("{}_count".format(prefix), 0) for log in logging_outputs
+                )
+                correct_sum = sum(
+                    log.get("{}_correct".format(prefix), 0) for log in logging_outputs
+                )
+                loss_sum = sum(
+                    log.get("{}_loss".format(prefix), 0) for log in logging_outputs
+                )
+
+                if "duration" not in target_prefix:
+                    # we divide by log(2) to convert the loss from base e to base 2
+                    metrics.log_scalar(
+                        "{}_loss".format(prefix),
+                        loss_sum / count_sum / math.log(2),  # NOTE(review): no guard for count_sum == 0
+                        count_sum,
+                        round=3,
+                    )
+                    metrics.log_derived(
+                        "{}_ppl".format(prefix),
+                        lambda meters, prefix=prefix: utils.get_perplexity(  # prefix bound as a default to avoid late binding in the loop
+                            meters["{}_loss".format(prefix)].avg
+                        ),
+                    )
+                else:
+                    # for duration we don't need to divide by log(2)
+                    metrics.log_scalar(
+                        "{}_loss".format(prefix),
+                        loss_sum / count_sum,
+                        count_sum,
+                        round=3,
+                    )
+
+                accuracy = 100 * correct_sum / count_sum  # percentage of exact-match predictions
+                metrics.log_scalar("{}_pred_acc".format(prefix), accuracy, round=3)
+
+        # Logging training loss
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+
+        # we divide by log(2) to convert the loss from base e to base 2
+        metrics.log_scalar(
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
new file mode 100644
index 0000000000..06a8252140
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
@@ -0,0 +1,517 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from collections import OrderedDict
+
+import torch
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import register_criterion
+from fairseq.criterions.ctc import CtcCriterion
+from fairseq.criterions.label_smoothed_cross_entropy_with_rdrop import (
+    RdropLabelSmoothedCrossEntropyCriterion,
+    RdropLabelSmoothedCrossEntropyCriterionConfig,
+    duplicate_input,
+)
+from fairseq.criterions.tacotron2_loss import (
+    Tacotron2Criterion,
+    Tacotron2CriterionConfig,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class MultitaskCriterion:  # mixin: builds one auxiliary criterion per multitask task and sums their weighted losses
+    def __init__(self, multitask_tasks, rdrop_alpha=0.0):
+        self.rdrop_alpha = rdrop_alpha
+        self.rdrop_alpha_mtl = rdrop_alpha
+
+        self.multitask_criterion = OrderedDict()
+        self.multitask_loss_weight = OrderedDict()  # filled only through set_multitask_loss_weight()
+        for task_name, task_obj in multitask_tasks.items():
+            if task_obj.args.get_loss_weight(0) == 0:  # presumably the weight at update 0 — TODO confirm the argument's meaning
+                logger.info(f"Skip {task_name} loss criterion")
+                continue
+
+            rdrop_alpha_task = task_obj.args.rdrop_alpha
+            if rdrop_alpha_task is None:
+                rdrop_alpha_task = rdrop_alpha  # fall back to the criterion-level value
+            self.rdrop_alpha_mtl = rdrop_alpha_task  # NOTE(review): overwritten each iteration, so the last non-skipped task wins
+            logger.info(f"rdrop_alpha is set to {rdrop_alpha_task} for {task_name}")
+
+            if task_obj.args.decoder_type == "ctc":
+                self.multitask_criterion[task_name] = CtcCriterion(
+                    task_obj.args.criterion_cfg,
+                    task_obj,
+                    rdrop_alpha=rdrop_alpha_task,
+                )
+            else:
+                self.multitask_criterion[
+                    task_name
+                ] = RdropLabelSmoothedCrossEntropyCriterion(
+                    task_obj,
+                    task_obj.args.criterion_cfg.sentence_avg,
+                    label_smoothing=task_obj.args.criterion_cfg.label_smoothing,
+                    rdrop_alpha=rdrop_alpha_task,
+                )
+
+    def set_multitask_loss_weight(self, task_name, weight=0.0):  # must be called per task before get_multitask_loss reads the weight
+        self.multitask_loss_weight[task_name] = weight
+
+    def get_multitask_loss(self, model, sample, model_out):  # returns (weighted loss sum, {task_name: logging_output})
+        logging_output = {}
+        loss = 0.0
+        for task_name, task_criterion in self.multitask_criterion.items():
+            layer_id = task_criterion.task.args.input_layer
+            if isinstance(task_criterion, CtcCriterion):
+                if task_criterion.task.args.input_from == "encoder":
+                    if len(model_out["encoder_padding_mask"]) > 0:
+                        non_padding_mask = ~model_out["encoder_padding_mask"][0]
+                        input_lengths = non_padding_mask.long().sum(-1)
+                    else:
+                        out = model_out["encoder_states"][layer_id]
+                        input_lengths = out.new_full(  # no padding mask: every sequence gets length out.shape[0]
+                            (out.shape[1],), out.shape[0]
+                        ).long()  # assumes encoder states are laid out (time, batch, ...) — TODO confirm
+
+                    task_sample = {
+                        "net_input": {
+                            "src_tokens": model_out["encoder_states"][
+                                layer_id
+                            ],  # check batch idx
+                            "src_lengths": input_lengths,
+                        },
+                        "id": sample["id"],
+                    }
+                else:
+                    task_sample = {
+                        "net_input": {
+                            "src_tokens": model_out["inner_states"][layer_id],
+                            "src_lengths": sample["target_lengths"],
+                        },
+                        "id": sample["id"],
+                    }
+            else:
+                task_sample = {
+                    "net_input": {
+                        "src_tokens": sample["multitask"][task_name]["net_input"][
+                            "prev_output_tokens"
+                        ],
+                        "encoder_out": {
+                            "encoder_out": [model_out["encoder_states"][layer_id]],
+                            "encoder_padding_mask": model_out["encoder_padding_mask"],
+                        },
+                    }
+                }
+
+            for key in ["target", "target_lengths", "ntokens"]:  # copy the task's own supervision into the synthesized sample
+                task_sample[key] = sample["multitask"][task_name][key]
+
+            if task_name == getattr(model, "mt_task_name", None):  # reuse the decoder output the model already produced for this task
+                decoder_out = model_out["mt_decoder_out"]
+            else:
+                decoder_out = None
+            task_loss, task_sample_size, task_logging_output = task_criterion(
+                model.multitask_decoders[task_name], task_sample, net_output=decoder_out
+            )
+
+            loss = loss + self.multitask_loss_weight[task_name] * task_loss  # KeyError if no weight was set for task_name
+            task_logging_output["loss_weight"] = self.multitask_loss_weight[task_name]
+            logging_output[task_name] = task_logging_output
+        return loss, logging_output
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:  # expects a non-empty list whose first entry has a "multitask" dict
+        for task_name in logging_outputs[0]["multitask"].keys():
+            # different criterion may return different logging
+            # currently only reduce on loss, the most common one
+            # ideally the way that losses are reduced should also depend on the task type
+            loss_sum = sum(
+                log["multitask"][task_name].get("loss", 0) for log in logging_outputs
+            )
+            sample_size = sum(
+                log["multitask"][task_name].get("sample_size", 0)
+                for log in logging_outputs
+            )
+
+            metrics.log_scalar(
+                f"multitask_{task_name}_loss",
+                loss_sum / sample_size / math.log(2),  # NOTE(review): no guard for sample_size == 0
+                sample_size,
+                round=3,
+            )
+
+            loss_weight = logging_outputs[0]["multitask"][task_name].get(  # weight is read from the first entry only
+                "loss_weight", 0
+            )
+            metrics.log_scalar(
+                f"multitask_{task_name}_loss_weight",
+                loss_weight,
+                weight=0,
+                priority=250,
+            )
+
+
+@register_criterion(
+    "speech_to_unit", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig
+)
+class SpeechToUnitMultitaskTaskCriterion(  # main R-Drop label-smoothed loss plus optional multitask losses
+    RdropLabelSmoothedCrossEntropyCriterion, MultitaskCriterion
+):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size=0,
+        report_accuracy=False,
+        rdrop_alpha=0.0,
+    ):
+        super().__init__(  # resolves to RdropLabelSmoothedCrossEntropyCriterion, the first base
+            task,
+            sentence_avg,
+            label_smoothing,
+            ignore_prefix_size,
+            report_accuracy,
+            rdrop_alpha,
+        )
+        MultitaskCriterion.__init__(self, task.multitask_tasks, rdrop_alpha)  # the mixin is initialized explicitly
+
+    def forward(self, model, sample, reduce=True):  # returns (loss, sample_size, logging_output)
+        net_input_concat = {
+            "src_tokens": sample["net_input"]["src_tokens"],
+            "src_lengths": sample["net_input"]["src_lengths"],
+            "prev_output_tokens": sample["net_input"]["prev_output_tokens"],
+            "tgt_speaker": sample["net_input"].get("tgt_speaker", None),
+            "return_all_hiddens": True,
+        }
+
+        if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0:  # presumably duplicates the batch for R-Drop — see duplicate_input
+            net_input_concat = duplicate_input(net_input_concat)
+
+        net_output, extra = model(**net_input_concat)
+        loss, nll_loss, rdrop_kl_loss = self.compute_loss(
+            model, [net_output], sample, reduce=reduce
+        )
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, [net_output], sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        if self.rdrop_alpha > 0:
+            logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)
+
+        if len(self.multitask_criterion) == 0:  # no auxiliary tasks: return the main loss only
+            return loss, sample_size, logging_output
+
+        # multitask
+        multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
+        loss += multitask_loss  # logging_output["loss"] was captured before this addition
+        logging_output["multitask"] = multitask_log
+
+        return loss, sample_size, logging_output
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:  # indexes logging_outputs[0], so the list must be non-empty
+        super().reduce_metrics(logging_outputs)
+
+        # inference metrics
+        if "targ_frames" in logging_outputs[0]:  # NOTE(review): guard checks "targ_frames" but the divisor below is "norm_frames"
+            n = sum(log.get("norm_frames", 0) for log in logging_outputs)
+            for key, new_key in [
+                ("mcd_loss", "mcd_loss"),
+                ("pred_frames", "pred_ratio"),
+                ("nins", "ins_rate"),
+                ("ndel", "del_rate"),
+            ]:
+                val = sum(log.get(key, 0) for log in logging_outputs)
+                metrics.log_scalar(new_key, val / n, n, round=3)  # NOTE(review): no guard for n == 0
+
+        if "multitask" not in logging_outputs[0]:
+            return
+
+        MultitaskCriterion.reduce_metrics(logging_outputs)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        return False
+
+
+@register_criterion(
+    "speech_to_unit_2pass", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig
+)
+class SpeechToUnit2passMultitaskTaskCriterion(SpeechToUnitMultitaskTaskCriterion):  # variant that also feeds the MT (and optional ASR) decoder inputs
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        label_smoothing,
+        ignore_prefix_size=0,
+        report_accuracy=False,
+        rdrop_alpha=0.0,
+    ):
+        super().__init__(  # pure pass-through to the parent constructor
+            task,
+            sentence_avg,
+            label_smoothing,
+            ignore_prefix_size,
+            report_accuracy,
+            rdrop_alpha,
+        )
+
+    def forward(self, model, sample, reduce=True):  # returns (loss, sample_size, logging_output)
+        net_input_concat = {
+            "src_tokens": sample["net_input"]["src_tokens"],
+            "src_lengths": sample["net_input"]["src_lengths"],
+            "prev_output_tokens": sample["net_input"]["prev_output_tokens"],
+            "prev_output_tokens_mt": sample["multitask"][model.mt_task_name][  # requires model.mt_task_name and that task in sample["multitask"]
+                "net_input"
+            ]["prev_output_tokens"],
+            "tgt_speaker": sample["net_input"].get("tgt_speaker", None),
+            "return_all_hiddens": True,
+        }
+        if getattr(model, "asr_task_name", None) is not None:  # ASR decoder input is passed only when the model names an ASR task
+            net_input_concat["prev_output_tokens_asr"] = sample["multitask"][
+                model.asr_task_name
+            ]["net_input"]["prev_output_tokens"]
+
+        if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0:
+            net_input_concat = duplicate_input(net_input_concat)
+
+        net_output, extra = model(**net_input_concat)
+        loss, nll_loss, rdrop_kl_loss = self.compute_loss(
+            model, [net_output], sample, reduce=reduce
+        )
+
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.data,
+            "nll_loss": nll_loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        if self.report_accuracy:
+            n_correct, total = self.compute_accuracy(model, [net_output], sample)
+            logging_output["n_correct"] = utils.item(n_correct.data)
+            logging_output["total"] = utils.item(total.data)
+        if self.rdrop_alpha > 0:
+            logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)
+
+        if len(self.multitask_criterion) == 0:  # no auxiliary tasks: return the main loss only
+            return loss, sample_size, logging_output
+
+        # multitask
+        multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
+        loss += multitask_loss  # logging_output["loss"] was captured before this addition
+        logging_output["multitask"] = multitask_log
+
+        return loss, sample_size, logging_output
+
+
+@register_criterion("speech_to_spectrogram", dataclass=Tacotron2CriterionConfig)
+class SpeechToSpectrogramMultitaskTaskCriterion(Tacotron2Criterion, MultitaskCriterion):  # Tacotron2 losses plus optional multitask losses
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        use_guided_attention_loss,
+        guided_attention_loss_sigma,
+        bce_pos_weight,
+        ctc_weight,
+    ):
+        super().__init__(  # resolves to Tacotron2Criterion, the first base
+            task,
+            sentence_avg,
+            use_guided_attention_loss,
+            guided_attention_loss_sigma,
+            bce_pos_weight,
+            ctc_weight,
+        )
+        MultitaskCriterion.__init__(self, task.multitask_tasks)  # rdrop_alpha keeps its 0.0 default here
+
+    def forward(self, model, sample, reduction="mean"):  # returns (loss, sample_size, logging_output)
+        bsz, max_len, _ = sample["target"].size()  # target must be 3-dimensional
+        feat_tgt = sample["target"]
+        feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
+        eos_tgt = torch.arange(max_len).to(sample["target"].device)
+        eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
+        eos_tgt = (eos_tgt == (feat_len - 1)).float()  # 1.0 at index target_lengths - 1 of each row, 0.0 elsewhere
+
+        feat_out, eos_out, extra = model(
+            src_tokens=sample["net_input"]["src_tokens"],
+            src_lengths=sample["net_input"]["src_lengths"],
+            prev_output_tokens=sample["net_input"]["prev_output_tokens"],
+            tgt_speaker=sample["net_input"]["tgt_speaker"],  # NOTE(review): hard index (KeyError if absent); speech_to_unit uses .get(..., None)
+            target_lengths=sample["target_lengths"],
+            return_all_hiddens=True,
+        )
+
+        l1_loss, mse_loss, eos_loss = self.compute_loss(
+            extra["feature_out"],
+            feat_out,
+            eos_out,
+            feat_tgt,
+            eos_tgt,
+            sample["target_lengths"],
+            reduction,
+        )
+        attn_loss = torch.tensor(0.0).type_as(l1_loss)  # zero placeholder when guided attention is disabled
+        if self.guided_attn is not None:
+            attn_loss = self.guided_attn(
+                extra["attn"],
+                sample["net_input"]["src_lengths"],
+                sample["target_lengths"],
+                reduction,
+            )
+        loss = (
+            l1_loss + mse_loss + eos_loss + attn_loss
+        )  # do not include ctc loss as there's no text target
+
+        sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
+        logging_output = {
+            "loss": utils.item(loss.data),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+            "l1_loss": utils.item(l1_loss.data),
+            "mse_loss": utils.item(mse_loss.data),
+            "eos_loss": utils.item(eos_loss.data),
+            "attn_loss": utils.item(attn_loss.data),
+        }
+
+        if len(self.multitask_criterion) == 0:  # no auxiliary tasks: return the main loss only
+            return loss, sample_size, logging_output
+
+        # multitask
+        multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
+        loss += multitask_loss  # logging_output["loss"] was captured before this addition
+        logging_output["multitask"] = multitask_log
+        return loss, sample_size, logging_output
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs) -> None:  # indexes logging_outputs[0], so the list must be non-empty
+        super().reduce_metrics(logging_outputs)
+
+        # inference metrics
+        if "targ_frames" in logging_outputs[0]:  # NOTE(review): guard checks "targ_frames" but the divisor below is "norm_frames"
+            n = sum(log.get("norm_frames", 0) for log in logging_outputs)
+            for key, new_key in [
+                ("mcd_loss", "mcd_loss"),
+                ("pred_frames", "pred_ratio"),
+                ("nins", "ins_rate"),
+                ("ndel", "del_rate"),
+            ]:
+                val = sum(log.get(key, 0) for log in logging_outputs)
+                metrics.log_scalar(new_key, val / n, n, round=3)  # NOTE(review): no guard for n == 0
+
+        if "multitask" not in logging_outputs[0]:
+            return
+
+        MultitaskCriterion.reduce_metrics(logging_outputs)
+
+
+@register_criterion("speech_to_spectrogram_2pass", dataclass=Tacotron2CriterionConfig)
+class SpeechToSpectrogram2passMultitaskTaskCriterion(  # variant that also feeds the MT decoder input to the model
+    SpeechToSpectrogramMultitaskTaskCriterion
+):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        use_guided_attention_loss,
+        guided_attention_loss_sigma,
+        bce_pos_weight,
+        ctc_weight,
+    ):
+        super().__init__(  # pure pass-through to the parent constructor
+            task,
+            sentence_avg,
+            use_guided_attention_loss,
+            guided_attention_loss_sigma,
+            bce_pos_weight,
+            ctc_weight,
+        )
+
+    def forward(self, model, sample, reduction="mean"):  # returns (loss, sample_size, logging_output)
+        bsz, max_len, _ = sample["target"].size()  # target must be 3-dimensional
+        feat_tgt = sample["target"]
+        feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
+        eos_tgt = torch.arange(max_len).to(sample["target"].device)
+        eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
+        eos_tgt = (eos_tgt == (feat_len - 1)).float()  # 1.0 at index target_lengths - 1 of each row, 0.0 elsewhere
+
+        feat_out, eos_out, extra = model(
+            src_tokens=sample["net_input"]["src_tokens"],
+            src_lengths=sample["net_input"]["src_lengths"],
+            prev_output_tokens=sample["net_input"]["prev_output_tokens"],
+            prev_output_tokens_mt=sample["multitask"][model.mt_task_name]["net_input"][  # requires model.mt_task_name
+                "prev_output_tokens"
+            ],
+            tgt_speaker=sample["net_input"]["tgt_speaker"],  # NOTE(review): hard index (KeyError if absent)
+            target_lengths=sample["target_lengths"],
+            return_all_hiddens=True,
+        )
+
+        l1_loss, mse_loss, eos_loss = self.compute_loss(
+            extra["feature_out"],
+            feat_out,
+            eos_out,
+            feat_tgt,
+            eos_tgt,
+            sample["target_lengths"],
+            reduction,
+        )
+        attn_loss = torch.tensor(0.0).type_as(l1_loss)  # zero placeholder when guided attention is disabled
+        if self.guided_attn is not None:
+            attn_loss = self.guided_attn(
+                extra["attn"],
+                sample["net_input"]["src_lengths"],
+                sample["target_lengths"],
+                reduction,
+            )
+        loss = (
+            l1_loss + mse_loss + eos_loss + attn_loss
+        )  # do not include ctc loss as there's no text target
+
+        sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
+        logging_output = {
+            "loss": utils.item(loss.data),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+            "l1_loss": utils.item(l1_loss.data),
+            "mse_loss": utils.item(mse_loss.data),
+            "eos_loss": utils.item(eos_loss.data),
+            "attn_loss": utils.item(attn_loss.data),
+        }
+
+        if len(self.multitask_criterion) == 0:  # no auxiliary tasks: return the main loss only
+            return loss, sample_size, logging_output
+
+        # multitask
+        multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
+        loss += multitask_loss  # logging_output["loss"] was captured before this addition
+        logging_output["multitask"] = multitask_log
+        return loss, sample_size, logging_output
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
new file mode 100644
index 0000000000..eea74bae26
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from dataclasses import dataclass, field
+
+import torch.nn.functional as F
+from fairseq.logging import metrics
+from fairseq.tasks import FairseqTask
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import II
+
+
+@dataclass
+class SpeechUnitLmCriterionConfig(FairseqDataclass):  # options read by SpeechUnitLmCriterion
+    sentence_avg: bool = II("optimization.sentence_avg")  # omegaconf interpolation of optimization.sentence_avg
+    loss_weights: str = field(
+        default="1.;0.0;0.0",  # ";"-separated; the criterion asserts exactly three non-negative floats
+        metadata={
+            "help": "Weights of the losses that correspond to token, duration, and F0 streams"
+        },
+    )
+    discrete_duration: bool = II("task.discrete_duration")  # True selects nll_loss for durations, else mae_loss
+    discrete_f0: bool = II("task.discrete_f0")  # True selects nll_loss for F0, else mae_loss
+
+
+def mae_loss(pred, targ, mask, reduce=True):
+    """Masked L1 loss; positions where ``mask`` is True contribute zero."""
+    if pred.ndim == 3:
+        pred = pred.squeeze(2)
+    else:
+        assert pred.ndim == 2
+    err = (pred.float() - targ.float()).abs() * (~mask).float()
+    return err.sum() if reduce else err.view(-1)
+
+
+def nll_loss(pred, targ, mask, reduce=True):
+    """Masked token NLL from raw logits; True entries of ``mask`` are zeroed."""
+    lprob = F.log_softmax(pred, dim=-1).view(-1, pred.size(-1))
+    keep = (~mask).float().view(-1)
+    per_tok = F.nll_loss(lprob, targ.view(-1), reduction="none") * keep
+    return per_tok.sum() if reduce else per_tok.view(-1)
+
+
+@register_criterion("speech_unit_lm_criterion", dataclass=SpeechUnitLmCriterionConfig)
+class SpeechUnitLmCriterion(FairseqCriterion):
+    """Weighted sum of the token, duration and F0 stream losses."""
+
+    def __init__(self, cfg: SpeechUnitLmCriterionConfig, task: FairseqTask):
+        super().__init__(task)
+        self.sentence_avg = cfg.sentence_avg
+        # One weight per stream, in the order (token, duration, F0).
+        self.weights = torch.tensor([float(w) for w in cfg.loss_weights.split(";")])
+        assert self.weights.size(0) == 3
+        assert (self.weights >= 0.0).all()
+
+        self.dur_loss_fn = nll_loss if cfg.discrete_duration else mae_loss
+        self.f0_loss_fn = nll_loss if cfg.discrete_f0 else mae_loss
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+
+        token_loss = nll_loss(
+            net_output["token"], sample["target"], sample["mask"], reduce
+        )
+        dur_loss = self.dur_loss_fn(
+            net_output["duration"],
+            sample["dur_target"],
+            sample["dur_mask"],
+            reduce,
+        )
+        f0_loss = self.f0_loss_fn(
+            net_output["f0"],
+            sample["f0_target"],
+            sample["f0_mask"],
+            reduce,
+        )
+        loss = self.weights.to(token_loss.device) * torch.stack(
+            [token_loss, dur_loss, f0_loss], dim=-1
+        )
+        loss = loss.sum() if reduce else loss.sum(-1)
+
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": loss.detach().sum().item(),
+            "token_loss": token_loss.detach().sum().item(),
+            "dur_loss": dur_loss.detach().sum().item(),
+            "f0_loss": f0_loss.detach().sum().item(),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training.
+
+        Each loss term is summed over ``logging_outputs`` (missing keys count
+        as 0) and logged divided by the summed ``sample_size``, which also
+        serves as the weight of the logged scalar.
+        """
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        # A zero total (no outputs, or only empty batches) must not raise
+        # ZeroDivisionError, so divide by 1 in that case.
+        denom = sample_size or 1
+
+        for key in ("loss", "token_loss", "dur_loss", "f0_loss"):
+            total = sum(log.get(key, 0) for log in logging_outputs)
+            metrics.log_scalar(key, total / denom, sample_size, round=3)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
new file mode 100644
index 0000000000..4113fdc548
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
@@ -0,0 +1,227 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import Any, Dict, List
+
+import torch
+import torch.nn.functional as F
+from omegaconf import II
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.data.data_utils import lengths_to_mask
+from fairseq.dataclass import FairseqDataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Tacotron2CriterionConfig(FairseqDataclass):  # options of the "tacotron2" criterion
+    bce_pos_weight: float = field(
+        default=1.0,
+        metadata={"help": "weight of positive examples for BCE loss"},
+    )
+    use_guided_attention_loss: bool = field(
+        default=False,
+        metadata={"help": "use guided attention loss"},
+    )
+    guided_attention_loss_sigma: float = field(
+        default=0.4,  # used as exp(-w / (2 * sigma**2)) in GuidedAttentionLoss
+        metadata={"help": "sigma (width of the diagonal band) of guided attention loss"},
+    )
+    ctc_weight: float = field(default=0.0, metadata={"help": "weight for CTC loss"})
+    sentence_avg: bool = II("optimization.sentence_avg")
+
+
+class GuidedAttentionLoss(torch.nn.Module):
+    """
+    Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
+    Networks with Guided Attention (https://arxiv.org/abs/1710.08969)
+    """
+
+    def __init__(self, sigma):
+        super().__init__()
+        self.sigma = sigma
+
+    @staticmethod
+    @lru_cache(maxsize=8)
+    def _get_weight(s_len, t_len, sigma):
+        # (t_len, s_len) penalty grid: 0 on the diagonal, growing towards 1 away from it.
+        grid_x = torch.arange(t_len).unsqueeze(1).float()
+        grid_y = torch.arange(s_len).unsqueeze(0).float()
+        w = (grid_y / s_len - grid_x / t_len) ** 2
+        return 1.0 - torch.exp(-w / (2 * (sigma**2)))
+
+    def _get_weights(self, src_lens, tgt_lens):
+        # Tensors hash by identity, so the lru_cache above only hits on plain ints.
+        src_lens, tgt_lens = (torch.as_tensor(x).tolist() for x in (src_lens, tgt_lens))
+        weights = torch.zeros((len(src_lens), max(tgt_lens), max(src_lens)))
+        for i, (s_len, t_len) in enumerate(zip(src_lens, tgt_lens)):
+            weights[i, :t_len, :s_len] = self._get_weight(s_len, t_len, self.sigma)
+        return weights
+
+    @staticmethod
+    def _get_masks(src_lens, tgt_lens):
+        in_masks = lengths_to_mask(src_lens)
+        out_masks = lengths_to_mask(tgt_lens)
+        return out_masks.unsqueeze(2) & in_masks.unsqueeze(1)
+
+    def forward(self, attn, src_lens, tgt_lens, reduction="mean"):
+        weights = self._get_weights(src_lens, tgt_lens).to(attn.device)
+        masks = self._get_masks(src_lens, tgt_lens).to(attn.device)
+        loss = (weights * attn.transpose(1, 2)).masked_select(masks)
+        return torch.sum(loss) if reduction == "sum" else torch.mean(loss)
+
+
+@register_criterion("tacotron2", dataclass=Tacotron2CriterionConfig)
+class Tacotron2Criterion(FairseqCriterion):
+    def __init__(
+        self,
+        task,
+        sentence_avg,
+        use_guided_attention_loss,
+        guided_attention_loss_sigma,
+        bce_pos_weight,
+        ctc_weight,
+    ):
+        """Store the loss options; build the guided-attention term if enabled."""
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+        self.bce_pos_weight = bce_pos_weight
+        self.ctc_weight = ctc_weight
+        self.guided_attn = (
+            GuidedAttentionLoss(guided_attention_loss_sigma) if use_guided_attention_loss else None
+        )
+
+    def forward(self, model, sample, reduction="mean"):
+        bsz, max_len, _ = sample["target"].size()  # target is unpacked as a 3-D tensor
+        feat_tgt = sample["target"]
+        feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len)
+        eos_tgt = torch.arange(max_len).to(sample["target"].device)
+        eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1)
+        eos_tgt = (eos_tgt == (feat_len - 1)).float()  # 1.0 at index length - 1 of each row, else 0.0
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lens = sample["net_input"]["src_lengths"]
+        tgt_lens = sample["target_lengths"]
+
+        feat_out, eos_out, extra = model(
+            src_tokens=src_tokens,
+            src_lengths=src_lens,
+            prev_output_tokens=sample["net_input"]["prev_output_tokens"],
+            incremental_state=None,
+            target_lengths=tgt_lens,
+            speaker=sample["speaker"],
+        )
+
+        l1_loss, mse_loss, eos_loss = self.compute_loss(
+            extra["feature_out"],  # received by compute_loss as feat_out
+            feat_out,  # received by compute_loss as feat_out_post
+            eos_out,
+            feat_tgt,
+            eos_tgt,
+            tgt_lens,
+            reduction,
+        )
+        attn_loss = torch.tensor(0.0).type_as(l1_loss)  # stays zero when guided attention is off
+        if self.guided_attn is not None:
+            attn_loss = self.guided_attn(extra["attn"], src_lens, tgt_lens, reduction)
+        ctc_loss = torch.tensor(0.0).type_as(l1_loss)  # stays zero unless ctc_weight > 0
+        if self.ctc_weight > 0.0:
+            net_output = (feat_out, eos_out, extra)
+            lprobs = model.get_normalized_probs(net_output, log_probs=True)
+            lprobs = lprobs.transpose(0, 1)  # T x B x C
+            src_mask = lengths_to_mask(src_lens)
+            src_tokens_flat = src_tokens.masked_select(src_mask)
+            ctc_loss = (
+                F.ctc_loss(
+                    lprobs,
+                    src_tokens_flat,  # CTC labels are the source tokens
+                    tgt_lens,  # input lengths: the output sequence lengths
+                    src_lens,  # label lengths: the source lengths
+                    reduction=reduction,
+                    zero_infinity=True,
+                )
+                * self.ctc_weight
+            )
+        loss = l1_loss + mse_loss + eos_loss + attn_loss + ctc_loss
+
+        sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"]
+        logging_output = {
+            "loss": utils.item(loss.data),
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+            "l1_loss": utils.item(l1_loss.data),
+            "mse_loss": utils.item(mse_loss.data),
+            "eos_loss": utils.item(eos_loss.data),
+            "attn_loss": utils.item(attn_loss.data),
+            "ctc_loss": utils.item(ctc_loss.data),
+        }
+        return loss, sample_size, logging_output
+
+    def compute_loss(
+        self,
+        feat_out,
+        feat_out_post,
+        eos_out,
+        feat_tgt,
+        eos_tgt,
+        tgt_lens,
+        reduction="mean",
+    ):
+        """Return (l1, mse, eos) losses over frames kept by lengths_to_mask(tgt_lens)."""
+        mask = lengths_to_mask(tgt_lens)
+        # reshape(-1) rather than squeeze(): with a single selected frame squeeze()
+        # would yield a 0-d tensor that no longer matches the 1-D EOS target.
+        _eos_out = eos_out[mask].reshape(-1)
+        _eos_tgt, _feat_tgt = eos_tgt[mask], feat_tgt[mask]
+        _feat_out, _feat_out_post = feat_out[mask], feat_out_post[mask]
+        l1_loss = F.l1_loss(_feat_out, _feat_tgt, reduction=reduction) + F.l1_loss(
+            _feat_out_post, _feat_tgt, reduction=reduction
+        )
+        mse_loss = F.mse_loss(_feat_out, _feat_tgt, reduction=reduction) + F.mse_loss(
+            _feat_out_post, _feat_tgt, reduction=reduction
+        )
+        eos_loss = F.binary_cross_entropy_with_logits(
+            _eos_out,
+            _eos_tgt,
+            pos_weight=torch.tensor(self.bce_pos_weight),
+            reduction=reduction,
+        )
+        return l1_loss, mse_loss, eos_loss
+
+    @classmethod
+    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
+        """Log sample_size-weighted averages of the losses, plus inference rates if present."""
+        ns = [log.get("sample_size", 0) for log in logging_outputs]
+        ntot = sum(ns)
+        ws = [n / (ntot + 1e-8) for n in ns]
+        for key in ["loss", "l1_loss", "mse_loss", "eos_loss", "attn_loss", "ctc_loss"]:
+            vals = [log.get(key, 0) for log in logging_outputs]
+            val = sum(val * w for val, w in zip(vals, ws))
+            metrics.log_scalar(key, val, ntot, round=3)
+        metrics.log_scalar("sample_size", ntot, len(logging_outputs))
+        # inference metrics; an empty list or a zero frame total must not raise
+        if not logging_outputs or "targ_frames" not in logging_outputs[0]:
+            return
+        n = sum(log.get("targ_frames", 0) for log in logging_outputs)
+        for key, new_key in [
+            ("mcd_loss", "mcd_loss"),
+            ("pred_frames", "pred_ratio"),
+            ("nins", "ins_rate"),
+            ("ndel", "del_rate"),
+        ]:
+            val = sum(log.get(key, 0) for log in logging_outputs)
+            metrics.log_scalar(new_key, val / (n or 1), n, round=3)
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        return False  # NOTE(review): presumably because reduce_metrics weights each entry by its own sample_size
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
new file mode 100644
index 0000000000..3975468487
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
@@ -0,0 +1,231 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.logging.meters import safe_round
+from fairseq.utils import is_xla_tensor
+
+
+@dataclass
+class Wav2VecCriterionConfig(FairseqDataclass):
+    infonce: bool = field(
+        default=False,  # False -> binary_cross_entropy_with_logits, True -> cross_entropy
+        metadata={
+            "help": "if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)"
+        },
+    )
+    loss_weights: Optional[List[float]] = field(
+        default=None,  # None disables the extra losses; a single value is repeated for every extra loss
+        metadata={"help": "weights for additional loss terms (not first one)"},
+    )
+    log_keys: List[str] = field(
+        default_factory=lambda: [],  # "logits" and "target" are only logged outside training
+        metadata={"help": "output keys to log"},
+    )
+
+
+@register_criterion("wav2vec", dataclass=Wav2VecCriterionConfig)
+class Wav2vecCriterion(FairseqCriterion):
+    def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
+        super().__init__(task)
+        self.infonce = infonce  # chooses cross_entropy over BCE-with-logits in forward()
+        self.loss_weights = loss_weights  # may be replaced by a broadcast list inside forward()
+        self.log_keys = [] if log_keys is None else log_keys  # None is normalised to an empty list
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        logits = model.get_logits(net_output).float()
+        target = model.get_targets(sample, net_output)
+        self.xla = is_xla_tensor(logits)  # stored on the instance; logging_outputs_can_be_summed() reads it
+
+        # XXX: handle weights on xla.
+        weights = None
+        if hasattr(model, "get_target_weights") and not self.infonce:
+            weights = model.get_target_weights(target, net_output)
+            if torch.is_tensor(weights):
+                weights = weights.float()
+
+        losses = []
+
+        reduction = "none" if ((not reduce) or self.xla) else "sum"  # xla always keeps per-element losses
+        if self.infonce:
+            loss = F.cross_entropy(logits, target, reduction=reduction)
+        else:
+            loss = F.binary_cross_entropy_with_logits(
+                logits, target.float(), weights, reduction=reduction
+            )
+
+        if self.xla:
+            # tpu-comment: since dynamic shapes lead to recompilations on xla,
+            # we don't shrink tensors using mask_indices.
+            # Instead, we use mask indices to adjust loss.
+            mi = (
+                sample["net_input"]["mask_indices"]
+                .transpose(0, 1)  # logits are transposed in `model.get_logits`
+                .reshape(logits.size(0))
+            )
+            loss = (loss * mi).sum() if reduce else (loss * mi)
+
+        if "sample_size" in sample:
+            sample_size = sample["sample_size"]
+        elif "mask_indices" in sample["net_input"]:
+            sample_size = sample["net_input"]["mask_indices"].sum()
+        else:
+            sample_size = target.numel() if self.infonce else target.long().sum().item()
+        losses.append(loss.detach().clone())  # snapshot taken before extra losses are added in place
+
+        if self.loss_weights is not None:
+            assert hasattr(model, "get_extra_losses")
+            extra_losses = model.get_extra_losses(net_output)
+            if torch.is_tensor(extra_losses):
+                extra_losses = [extra_losses]
+            if len(self.loss_weights) == 1 and len(extra_losses) != 1:
+                self.loss_weights = [self.loss_weights[0]] * len(extra_losses)  # persists on the instance
+            assert len(extra_losses) == len(
+                self.loss_weights
+            ), f"{len(extra_losses)}, {len(self.loss_weights)}"
+            for p, coef in zip(extra_losses, self.loss_weights):
+                if coef != 0 and p is not None:
+                    p = coef * p.float() * sample_size
+                    loss += p
+                    losses.append(p)
+
+        logging_output = {
+            "loss": loss.item() if (reduce and not self.xla) else loss.detach(),
+            "ntokens": sample_size,
+            "nsentences": sample["id"].numel(),
+            "sample_size": sample_size,
+        }
+
+        for lk in self.log_keys:
+            # Only store "logits" and "target" for computing MAP and MAUC
+            # during validation
+            if lk == "logits":
+                if not self.training:
+                    logging_output["logits"] = logits.cpu().numpy()
+            elif lk == "target":
+                if not self.training:
+                    # If the targets have been mixed with the predictions of
+                    # teacher models, find the original targets
+                    if hasattr(model, "get_original_targets"):
+                        original_target = model.get_original_targets(sample, net_output)
+                    else:
+                        original_target = target
+                    logging_output["target"] = original_target.cpu().numpy()
+            elif lk in net_output:
+                value = net_output[lk]
+                if not is_xla_tensor(value):
+                    value = float(value)
+                logging_output[lk] = value
+
+        if len(losses) > 1:
+            for i, l in enumerate(losses):
+                logging_output[f"loss_{i}"] = l.item() if not self.xla else l.detach()
+
+        if self.infonce:
+            with torch.no_grad():
+                if logits.numel() == 0:
+                    corr = 0
+                    count = 0
+                else:
+                    assert logits.dim() > 1, logits.shape
+                    max = logits.argmax(-1) == 0  # NOTE(review): shadows the builtin max in this method
+                    min = logits.argmin(-1) == 0  # NOTE(review): shadows the builtin min in this method
+                    if is_xla_tensor(logits):
+                        max, min = max * mi, min * mi
+                        both = max & min
+                        corr = max.long().sum() - both.long().sum()
+                        count = mi.sum()
+                    else:
+                        both = max & min  # rows where index 0 is both argmax and argmin are not counted
+                        corr = max.long().sum().item() - both.long().sum().item()
+                        count = float(max.numel())
+
+                logging_output["correct"] = corr
+                logging_output["count"] = count
+
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+        nsentences = utils.item(
+            sum(log.get("nsentences", 0) for log in logging_outputs)
+        )
+        sample_size = utils.item(
+            sum(log.get("sample_size", 0) for log in logging_outputs)
+        )
+
+        metrics.log_scalar(  # divides by ln 2 (nats -> bits, assuming natural-log losses)
+            "loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3
+        )
+        metrics.log_scalar("ntokens", ntokens)
+        metrics.log_scalar("nsentences", nsentences)
+
+        correct = sum(log.get("correct", 0) for log in logging_outputs)
+        metrics.log_scalar("_correct", correct)  # raw count feeding the derived "accuracy" below
+
+        total = sum(log.get("count", 0) for log in logging_outputs)
+        metrics.log_scalar("_total", total)  # raw count feeding the derived "accuracy" below
+
+        if total > 0:
+            metrics.log_derived(
+                "accuracy",
+                lambda meters: safe_round(
+                    meters["_correct"].sum / meters["_total"].sum, 5
+                )
+                if meters["_total"].sum > 0
+                else float("nan"),
+            )
+
+        builtin_keys = {
+            "loss",
+            "ntokens",
+            "nsentences",
+            "sample_size",
+            "correct",
+            "count",
+        }
+
+        for k in logging_outputs[0]:  # NOTE(review): an empty logging_outputs raises IndexError here
+            if k not in builtin_keys:
+                val = sum(log.get(k, 0) for log in logging_outputs)
+                if k.startswith("loss"):
+                    metrics.log_scalar(
+                        k, val / (sample_size or 1) / math.log(2), sample_size, round=3
+                    )
+                else:
+                    metrics.log_scalar(k, val / len(logging_outputs), round=3)  # mean over logging outputs
+
+    # FIXME: revert when gather based xla reduction is implemented
+    # @staticmethod
+    # def logging_outputs_can_be_summed() -> bool:
+    def logging_outputs_can_be_summed(self) -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improve distributed training speed.
+        """
+        # XXX: Gather based reduction is not implemented for xla yet, so xla
+        # uses sum based reduction. `xla` only exists once forward() has run.
+        return getattr(self, "xla", False)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
new file mode 100644
index 0000000000..eeaae2b254
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
@@ -0,0 +1,137 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+from .dictionary import Dictionary, TruncatedDictionary
+
+from .fairseq_dataset import FairseqDataset, FairseqIterableDataset
+
+from .base_wrapper_dataset import BaseWrapperDataset
+
+from .add_target_dataset import AddTargetDataset
+from .append_token_dataset import AppendTokenDataset
+from .audio.raw_audio_dataset import BinarizedAudioDataset, FileAudioDataset
+from .audio.hubert_dataset import HubertDataset
+from .backtranslation_dataset import BacktranslationDataset
+from .bucket_pad_length_dataset import BucketPadLengthDataset
+from .colorize_dataset import ColorizeDataset
+from .concat_dataset import ConcatDataset
+from .concat_sentences_dataset import ConcatSentencesDataset
+from .denoising_dataset import DenoisingDataset
+from .id_dataset import IdDataset
+from .indexed_dataset import (
+    IndexedCachedDataset,
+    IndexedDataset,
+    IndexedRawTextDataset,
+    MMapIndexedDataset,
+)
+from .language_pair_dataset import LanguagePairDataset
+from .list_dataset import ListDataset
+from .lm_context_window_dataset import LMContextWindowDataset
+from .lru_cache_dataset import LRUCacheDataset
+from .mask_tokens_dataset import MaskTokensDataset
+from .monolingual_dataset import MonolingualDataset
+from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset
+from .nested_dictionary_dataset import NestedDictionaryDataset
+from .noising import NoisingDataset
+from .numel_dataset import NumelDataset
+from .num_samples_dataset import NumSamplesDataset
+from .offset_tokens_dataset import OffsetTokensDataset
+from .padding_mask_dataset import (
+    LeftPaddingMaskDataset,
+    PaddingMaskDataset,
+    RightPaddingMaskDataset,
+)
+from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset
+from .prepend_dataset import PrependDataset
+from .prepend_token_dataset import PrependTokenDataset
+from .raw_label_dataset import RawLabelDataset
+from .replace_dataset import ReplaceDataset
+from .resampling_dataset import ResamplingDataset
+from .roll_dataset import RollDataset
+from .round_robin_zip_datasets import RoundRobinZipDatasets
+from .sort_dataset import SortDataset
+from .speech_dlm_dataset import SpeechDLMDataset
+from .strip_token_dataset import StripTokenDataset
+from .subsample_dataset import SubsampleDataset
+from .token_block_dataset import TokenBlockDataset
+from .transform_eos_dataset import TransformEosDataset
+from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset
+from .shorten_dataset import TruncateDataset, RandomCropDataset
+from .multilingual.sampled_multi_dataset import SampledMultiDataset
+from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset
+from .fasta_dataset import FastaDataset, EncodedFastaDataset
+from .transform_eos_concat_langpair_dataset import TransformEosConcatLangPairDataset
+
+from .iterators import (
+    CountingIterator,
+    EpochBatchIterator,
+    GroupedIterator,
+    ShardedIterator,
+)
+
+__all__ = [
+    "AddTargetDataset",
+    "AppendTokenDataset",
+    "BacktranslationDataset",
+    "BaseWrapperDataset",
+    "BinarizedAudioDataset",
+    "BucketPadLengthDataset",
+    "ColorizeDataset",
+    "ConcatDataset",
+    "ConcatSentencesDataset",
+    "CountingIterator",
+    "DenoisingDataset",
+    "Dictionary",
+    "EncodedFastaDataset",
+    "EpochBatchIterator",
+    "FairseqDataset",
+    "FairseqIterableDataset",
+    "FastaDataset",
+    "FileAudioDataset",
+    "GroupedIterator",
+    "HubertDataset",
+    "IdDataset",
+    "IndexedCachedDataset",
+    "IndexedDataset",
+    "IndexedRawTextDataset",
+    "LanguagePairDataset",
+    "LeftPadDataset",
+    "ListDataset",
+    "LMContextWindowDataset",
+    "LRUCacheDataset",
+    "MaskTokensDataset",
+    "MMapIndexedDataset",
+    "MonolingualDataset",
+    "MultiCorpusSampledDataset",
+    "NestedDictionaryDataset",
+    "NoisingDataset",
+    "NumelDataset",
+    "NumSamplesDataset",
+    "OffsetTokensDataset",
+    "PadDataset",
+    "PrependDataset",
+    "PrependTokenDataset",
+    "RandomCropDataset",
+    "RawLabelDataset",
+    "ResamplingDataset",
+    "ReplaceDataset",
+    "RightPadDataset",
+    "RollDataset",
+    "RoundRobinZipDatasets",
+    "SampledMultiDataset",
+    "SampledMultiEpochDataset",
+    "ShardedIterator",
+    "SortDataset",
+    "SpeechDLMDataset",
+    "StripTokenDataset",
+    "SubsampleDataset",
+    "TokenBlockDataset",
+    "TransformEosDataset",
+    "TransformEosLangPairDataset",
+    "TransformEosConcatLangPairDataset",
+    "TruncateDataset",
+    "TruncatedDictionary",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
new file mode 100644
index 0000000000..bf89f25656
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
@@ -0,0 +1,79 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset, data_utils
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+
+class AddTargetDataset(BaseWrapperDataset):  # NOTE(review): same class name as in add_target_dataset.py
+    def __init__(
+        self,
+        dataset,
+        labels,
+        pad,
+        eos,
+        batch_targets,
+        process_label=None,
+        label_len_fn=None,
+        add_to_input=False,
+        text_compression_level=TextCompressionLevel.none,
+    ):
+        super().__init__(dataset)
+        self.labels = labels
+        self.batch_targets = batch_targets
+        self.pad = pad
+        self.eos = eos
+        self.process_label = process_label
+        self.label_len_fn = label_len_fn
+        self.add_to_input = add_to_input
+        self.text_compressor = TextCompressor(level=text_compression_level)
+
+    def get_label(self, index, process_fn=None):
+        lbl = self.labels[index]
+        lbl = self.text_compressor.decompress(lbl)  # labels always pass through decompress first
+        return lbl if process_fn is None else process_fn(lbl)
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        item["label"] = self.get_label(index, process_fn=self.process_label)
+        return item
+
+    def size(self, index):
+        sz = self.dataset.size(index)
+        own_sz = self.label_len_fn(self.get_label(index))  # NOTE(review): fails if label_len_fn is left as None
+        return sz, own_sz
+
+    def collater(self, samples):
+        collated = self.dataset.collater(samples)
+        if len(collated) == 0:  # the wrapped collater produced nothing; return it unchanged
+            return collated
+        indices = set(collated["id"].tolist())
+        target = [s["label"] for s in samples if s["id"] in indices]  # labels of ids present in the collated batch
+
+        if self.batch_targets:
+            collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
+            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
+            collated["ntokens"] = collated["target_lengths"].sum().item()
+        else:
+            collated["ntokens"] = sum([len(t) for t in target])
+
+        collated["target"] = target
+
+        if self.add_to_input:
+            eos = target.new_full((target.size(0), 1), self.eos)  # NOTE(review): target is still a list unless batch_targets is set
+            collated["target"] = torch.cat([target, eos], dim=-1).long()
+            collated["net_input"]["prev_output_tokens"] = torch.cat(
+                [eos, target], dim=-1
+            ).long()
+            collated["ntokens"] += target.size(0)  # one extra EOS token per row
+        return collated
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        indices, ignored = data_utils._filter_by_size_dynamic(
+            indices, self.size, max_sizes
+        )
+        return indices, ignored
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
new file mode 100644
index 0000000000..978a5b1903
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset, data_utils
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+
+class AddTargetDataset(BaseWrapperDataset):
+    """Wrap a dataset so every sample carries a "label" and every batch a "target"."""
+
+    def __init__(
+        self,
+        dataset,
+        labels,
+        pad,
+        eos,
+        batch_targets,
+        process_label=None,
+        label_len_fn=None,
+        add_to_input=False,
+        text_compression_level=TextCompressionLevel.none,
+    ):
+        super().__init__(dataset)
+        self.labels = labels  # indexed by sample index; decompressed on every access
+        self.batch_targets = batch_targets  # True: collater() pads targets into one tensor
+        self.pad = pad  # padding index handed to collate_tokens
+        self.eos = eos  # token added around targets when add_to_input is set
+        self.process_label = process_label  # optional callable applied to labels in __getitem__
+        self.label_len_fn = label_len_fn  # callable giving a label's length; required by size()
+        self.add_to_input = add_to_input  # True: also build net_input["prev_output_tokens"]
+        self.text_compressor = TextCompressor(level=text_compression_level)
+
+    def get_label(self, index, process_fn=None):
+        """Return the decompressed label at ``index``, passed through ``process_fn`` if given."""
+        lbl = self.text_compressor.decompress(self.labels[index])
+        return lbl if process_fn is None else process_fn(lbl)
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        item["label"] = self.get_label(index, process_fn=self.process_label)
+        return item
+
+    def size(self, index):
+        """Return (wrapped dataset size, length of the unprocessed label)."""
+        return self.dataset.size(index), self.label_len_fn(self.get_label(index))
+
+    def collater(self, samples):
+        collated = self.dataset.collater(samples)
+        if len(collated) == 0:
+            return collated
+        indices = set(collated["id"].tolist())
+        target = [s["label"] for s in samples if s["id"] in indices]
+
+        if self.add_to_input:
+            eos = torch.LongTensor([self.eos])
+            prev_output_tokens = [torch.cat([eos, t], axis=-1) for t in target]
+            target = [torch.cat([t, eos], axis=-1) for t in target]
+            collated["net_input"]["prev_output_tokens"] = prev_output_tokens
+
+        if self.batch_targets:
+            collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
+            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
+            collated["ntokens"] = collated["target_lengths"].sum().item()
+            # net_input is a dict: getattr() never sees its keys, so the old check was
+            # always false and prev_output_tokens stayed an unpadded list of tensors.
+            prev = collated["net_input"].get("prev_output_tokens")
+            if isinstance(prev, list) and prev:
+                collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens(
+                    prev, pad_idx=self.pad, left_pad=False
+                )
+        else:
+            collated["ntokens"] = sum([len(t) for t in target])
+
+        collated["target"] = target
+        return collated
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        indices, ignored = data_utils._filter_by_size_dynamic(indices, self.size, max_sizes)
+        return indices, ignored
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
new file mode 100644
index 0000000000..87695bd0f5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class AppendTokenDataset(BaseWrapperDataset):
+    """Wrapper that appends ``token`` to each item; acts as a pass-through when it is None."""
+
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+        self.token = token
+        # Sizes grow by one only when a token will actually be appended.
+        self._sizes = dataset.sizes if token is None else np.array(dataset.sizes) + 1
+
+    def __getitem__(self, idx):
+        """Return the wrapped item, with the token concatenated at the end if set."""
+        item = self.dataset[idx]
+        if self.token is None:
+            return item
+        return torch.cat([item, item.new([self.token])])
+
+    @property
+    def sizes(self):
+        """Sizes computed once in ``__init__``."""
+        return self._sizes
+
+    def num_tokens(self, index):
+        """Token count of the wrapped item, plus one when a token is appended."""
+        n = self.dataset.num_tokens(index)
+        return n if self.token is None else n + 1
+
+    def size(self, index):
+        """Size of the wrapped item, plus one when a token is appended."""
+        n = self.dataset.size(index)
+        return n if self.token is None else n + 1
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
new file mode 100644
index 0000000000..dff90fadfc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
@@ -0,0 +1,93 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+import importlib
+import os
+import numpy as np
+
+
+class AudioTransform(ABC):
+    @classmethod
+    @abstractmethod
+    def from_config_dict(cls, config: Optional[Dict] = None):
+        """Build a transform from ``config``; declared abstract, so subclasses override it."""
+
+
+class CompositeAudioTransform(AudioTransform):
+    """Sequence of audio transforms applied one after another."""
+
+    def _from_config_dict(
+        cls,
+        transform_type,
+        get_audio_transform,
+        composite_cls,
+        config=None,
+        return_empty=False,
+    ):
+        """Build ``composite_cls`` from the ``{transform_type}_transforms`` config entry."""
+        # NOTE(review): no @classmethod here and ``cls`` is unused; callers presumably pass it explicitly.
+        settings = config if config is not None else {}
+        names = settings.get(f"{transform_type}_transforms")
+        if names is None:
+            if not return_empty:
+                return None
+            names = []
+        built = []
+        for name in names:
+            built.append(get_audio_transform(name).from_config_dict(settings.get(name)))
+        return composite_cls(built)
+
+    def __init__(self, transforms):
+        self.transforms = list(filter(lambda t: t is not None, transforms))
+
+    def __call__(self, x):
+        """Feed ``x`` through every transform in order and return the result."""
+        for transform in self.transforms:
+            x = transform(x)
+        return x
+
+    def __repr__(self):
+        """Multi-line repr: class name, one indented line per transform, closing paren."""
+        lines = [self.__class__.__name__ + "("]
+        lines.extend(f"    {t.__repr__()}" for t in self.transforms)
+        lines.append(")")
+        return "\n".join(lines)
+
+
+def register_audio_transform(name, cls_type, registry, class_names):
+    """Return a class decorator that files the decorated class in ``registry`` under ``name``."""
+
+    def register_audio_transform_cls(cls):
+        # Checks run in this order: duplicate name, wrong base class, duplicate class name.
+        if name in registry:
+            raise ValueError(f"Cannot register duplicate transform ({name})")
+        if not issubclass(cls, cls_type):
+            message = f"Transform ({name}: {cls.__name__}) must extend {cls_type.__name__}"
+            raise ValueError(message)
+        if cls.__name__ in class_names:
+            message = f"Cannot register audio transform with duplicate class name ({cls.__name__})"
+            raise ValueError(message)
+        # All checks passed: record the class and reserve its class name.
+        registry[name] = cls
+        class_names.add(cls.__name__)
+        return cls
+
+    return register_audio_transform_cls
+
+
+def import_transforms(transforms_dir, transform_type):
+    """Import each public ``.py`` file or sub-directory of ``transforms_dir`` as a module."""
+    package_prefix = f"fairseq.data.audio.{transform_type}_transforms."
+    for entry in os.listdir(transforms_dir):
+        # Names starting with "_" or "." are never imported.
+        if entry.startswith("_") or entry.startswith("."):
+            continue
+        is_py_file = entry.endswith(".py")
+        if not is_py_file and not os.path.isdir(os.path.join(transforms_dir, entry)):
+            continue
+        module_name = entry[: entry.find(".py")] if is_py_file else entry
+        importlib.import_module(package_prefix + module_name)
+
+
+# Helper for transforms: one uniform draw from [0, 1), rescaled to start at a with width b - a
+def rand_uniform(a, b):
+    return a + (b - a) * np.random.uniform()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
new file mode 100644
index 0000000000..590a7493ae
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
@@ -0,0 +1,389 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import mmap
+from pathlib import Path
+import io
+from typing import BinaryIO, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform
+
+SF_AUDIO_FILE_EXTENSIONS = {".wav", ".flac", ".ogg"}
+FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS = {".npy", ".wav", ".flac", ".ogg"}
+
+
+def convert_waveform(
+    waveform: Union[np.ndarray, torch.Tensor],
+    sample_rate: int,
+    normalize_volume: bool = False,
+    to_mono: bool = False,
+    to_sample_rate: Optional[int] = None,
+) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
+    """Apply the requested sox effects to a waveform via torchaudio.
+
+    Effects are chained in this order: volume normalization ("gain -n"),
+    resampling ("rate"), then down-mixing to a single channel ("channels 1").
+
+    Args:
+        waveform (numpy.ndarray or torch.Tensor): 2D waveform (channels x length)
+        sample_rate (int): sample rate of the input
+        normalize_volume (bool): add the volume-normalization effect
+        to_mono (bool): down-mix when the input has more than one channel
+        to_sample_rate (Optional[int]): resample when set and different from the input rate
+    Returns:
+        The converted waveform, in the same container type as the input, and the
+        sample rate reported by sox; the untouched inputs when no effect applies.
+    Raises: ImportError if ``torchaudio.sox_effects`` cannot be imported.
+    """
+    try:
+        import torchaudio.sox_effects as ta_sox
+    except ImportError:
+        raise ImportError("Please install torchaudio: pip install torchaudio")
+
+    # Effects are appended in a fixed order: gain, rate, channels.
+    sox_chain = []
+    if normalize_volume:
+        sox_chain.append(["gain", "-n"])
+    if to_sample_rate is not None and to_sample_rate != sample_rate:
+        sox_chain.append(["rate", f"{to_sample_rate}"])
+    if to_mono and waveform.shape[0] > 1:
+        sox_chain.append(["channels", "1"])
+    if not sox_chain:
+        # Nothing to do: hand the inputs back without copying or converting.
+        return waveform, sample_rate
+
+    from_numpy = isinstance(waveform, np.ndarray)
+    tensor_in = torch.from_numpy(waveform) if from_numpy else waveform
+    out, out_rate = ta_sox.apply_effects_tensor(tensor_in, sample_rate, sox_chain)
+    # sox runs on tensors, so a NumPy input is converted back before returning.
+    return (out.numpy() if from_numpy else out), out_rate
+
+
+def get_waveform(
+    path_or_fp: Union[str, BinaryIO],
+    normalization: bool = True,
+    mono: bool = True,
+    frames: int = -1,
+    start: int = 0,
+    always_2d: bool = True,
+    output_sample_rate: Optional[int] = None,
+    normalize_volume: bool = False,
+    waveform_transforms: Optional[CompositeAudioWaveformTransform] = None,
+) -> Tuple[np.ndarray, int]:
+    """Get the waveform and sample rate of a 16-bit WAV/FLAC/OGG Vorbis audio.
+    Args:
+        path_or_fp (str or BinaryIO): the path or file-like object
+        normalization (bool): normalize values to [-1, 1] (Default: True)
+        mono (bool): convert multi-channel audio to mono-channel one
+        frames (int): the number of frames to read. (-1 for reading all)
+        start (int): Where to start reading. A negative value counts from the end.
+        always_2d (bool): always return 2D array even for mono-channel audios
+        output_sample_rate (Optional[int]): output sample rate
+        normalize_volume (bool): normalize volume
+        waveform_transforms: optional callable applied as f(waveform, sample_rate)
+    Returns:
+        waveform (numpy.ndarray): 1D or 2D waveform (channels x length)
+        sample_rate (float): sample rate
+    """
+    if isinstance(path_or_fp, str):  # the extension is only validated for string paths, not file objects
+        ext = Path(path_or_fp).suffix
+        if ext not in SF_AUDIO_FILE_EXTENSIONS:
+            raise ValueError(f"Unsupported audio format: {ext}")
+
+    try:
+        import soundfile as sf  # imported lazily so the module loads without soundfile
+    except ImportError:
+        raise ImportError("Please install soundfile: pip install soundfile")
+
+    waveform, sample_rate = sf.read(
+        path_or_fp, dtype="float32", always_2d=True, frames=frames, start=start
+    )
+    waveform = waveform.T  # T x C -> C x T
+    waveform, sample_rate = convert_waveform(
+        waveform,
+        sample_rate,
+        normalize_volume=normalize_volume,
+        to_mono=mono,
+        to_sample_rate=output_sample_rate,
+    )
+
+    if not normalization:
+        waveform *= 2**15  # denormalized to 16-bit signed integers
+
+    if waveform_transforms is not None:  # transforms run after resampling and scaling
+        waveform, sample_rate = waveform_transforms(waveform, sample_rate)
+
+    if not always_2d:
+        waveform = waveform.squeeze(axis=0)  # NOTE(review): presumably expects a single channel here -- confirm
+
+    return waveform, sample_rate
+
+
+def get_features_from_npy_or_audio(path, waveform_transforms=None):
+    """Load features from a ``.npy`` file, or compute fbank features from an audio file."""
+    suffix = Path(path).suffix
+    if suffix not in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
+        raise ValueError(f'Unsupported file format for "{path}"')
+    if suffix == ".npy":
+        # Stored features are returned as loaded; waveform transforms are not involved.
+        return np.load(path)
+    return get_fbank(path, waveform_transforms=waveform_transforms)
+
+
+def get_features_or_waveform_from_stored_zip(
+    path,
+    byte_offset,
+    byte_size,
+    need_waveform=False,
+    use_sample_rate=None,
+    waveform_transforms=None,
+):
+    """Decode one entry, addressed by byte offset and size, of a stored ZIP file.
+
+    ``.npy`` payloads are loaded as arrays; audio payloads become a waveform when
+    ``need_waveform`` is set and fbank features otherwise. Raises ValueError for
+    any other payload.
+    """
+    assert path.endswith(".zip")
+    payload = read_from_stored_zip(path, byte_offset, byte_size)
+    buffer = io.BytesIO(payload)
+    if is_npy_data(payload):
+        return np.load(buffer)
+    if not is_sf_audio_data(payload):
+        raise ValueError(f'Unknown file format for "{path}"')
+    if not need_waveform:
+        return get_fbank(buffer, waveform_transforms=waveform_transforms)
+    return get_waveform(
+        buffer, always_2d=False, output_sample_rate=use_sample_rate,
+        waveform_transforms=waveform_transforms,
+    )[0]
+
+
+def get_features_or_waveform(
+    path: str, need_waveform=False, use_sample_rate=None, waveform_transforms=None
+):
+    """Load speech features or a raw waveform from ``path``.
+
+    ``path`` is either a plain .npy/.wav/.flac/.ogg file, or a slice of an
+    uncompressed ZIP file written as "<zip path>:<byte offset>:<byte length>".
+
+    Args:
+        path (str): file path, optionally carrying ZIP slicing information.
+        need_waveform (bool): return the waveform instead of features.
+        use_sample_rate (int): output sample rate requested for waveforms.
+        waveform_transforms: forwarded unchanged to the loading helpers.
+    Returns:
+        features_or_waveform (numpy.ndarray): speech features or waveform.
+    """
+    file_path, slice_ptr = parse_path(path)
+    if len(slice_ptr) == 2:
+        byte_offset, byte_size = slice_ptr
+        return get_features_or_waveform_from_stored_zip(
+            file_path,
+            byte_offset,
+            byte_size,
+            need_waveform=need_waveform,
+            use_sample_rate=use_sample_rate,
+            waveform_transforms=waveform_transforms,
+        )
+    if len(slice_ptr) != 0:
+        raise ValueError(f"Invalid path: {path}")
+
+    # Plain file on disk: no slicing information was attached to the path.
+    if not need_waveform:
+        return get_features_from_npy_or_audio(
+            file_path, waveform_transforms=waveform_transforms
+        )
+    return get_waveform(
+        file_path,
+        always_2d=False,
+        output_sample_rate=use_sample_rate,
+        waveform_transforms=waveform_transforms,
+    )[0]
+
+
+def _get_kaldi_fbank(
+    waveform: np.ndarray, sample_rate: int, n_bins=80
+) -> Optional[np.ndarray]:
+    """Compute fbank features with PyKaldi; return None if PyKaldi is not importable."""
+    try:
+        from kaldi.feat.fbank import Fbank, FbankOptions
+        from kaldi.feat.mel import MelBanksOptions
+        from kaldi.feat.window import FrameExtractionOptions
+        from kaldi.matrix import Vector
+
+        mel_options = MelBanksOptions()
+        mel_options.num_bins = n_bins
+        frame_options = FrameExtractionOptions()
+        frame_options.samp_freq = sample_rate
+        fbank_options = FbankOptions()
+        fbank_options.mel_opts = mel_options
+        fbank_options.frame_opts = frame_options
+        extractor = Fbank(opts=fbank_options)
+        # The computation stays inside the try block, exactly like the imports above.
+        return extractor.compute(Vector(waveform.squeeze()), 1.0).numpy()
+    except ImportError:
+        return None
+
+
+def _get_torchaudio_fbank(
+    waveform: np.ndarray, sample_rate, n_bins=80
+) -> Optional[np.ndarray]:
+    """Compute fbank features with TorchAudio; return None if it is not importable."""
+    try:
+        import torchaudio.compliance.kaldi as ta_kaldi
+
+        # The NumPy input is wrapped as a tensor for torchaudio and unwrapped afterwards.
+        fbank = ta_kaldi.fbank(
+            torch.from_numpy(waveform), num_mel_bins=n_bins, sample_frequency=sample_rate
+        )
+        return fbank.numpy()
+    except ImportError:
+        return None
+
+
+def get_fbank(
+    path_or_fp: Union[str, BinaryIO], n_bins=80, waveform_transforms=None
+) -> np.ndarray:
+    """Compute mel-filter bank features, trying PyKaldi first and TorchAudio second.
+
+    The waveform is read with ``normalization=False`` because Kaldi/TorchAudio
+    expect inputs on the 16-bit signed integer scale."""
+    waveform, sample_rate = get_waveform(
+        path_or_fp, normalization=False, waveform_transforms=waveform_transforms
+    )
+
+    # Each backend returns None when its library cannot be imported.
+    for backend in (_get_kaldi_fbank, _get_torchaudio_fbank):
+        features = backend(waveform, sample_rate, n_bins)
+        if features is not None:
+            return features
+
+    raise ImportError(
+        "Please install pyKaldi or torchaudio to enable "
+        "online filterbank feature extraction"
+    )
+
+
+def is_npy_data(data: bytes) -> bool:
+    return data[:2] == b"\x93N"  # first two bytes of the .npy magic; slicing keeps short input from raising
+
+
+def is_sf_audio_data(data: bytes) -> bool:
+    """Return True if ``data`` starts with the 3-byte prefix of a WAV, FLAC or OGG file."""
+    # Prefixes of "RIFF", "fLaC" and "OggS"; slicing (instead of indexing) makes
+    # empty or truncated payloads return False rather than raise IndexError.
+    return data[:3] in (b"RIF", b"fLa", b"Ogg")
+
+
+def mmap_read(path: str, offset: int, length: int) -> bytes:
+    """Return the byte slice ``[offset, offset + length)`` of a read-only memory map of ``path``."""
+    with open(path, "rb") as f, mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mapped:
+        # Slicing the map yields a bytes object, so the result outlives the mapping.
+        return mapped[offset : offset + length]
+
+
+def read_from_stored_zip(zip_path: str, offset: int, length: int) -> bytes:
+    return mmap_read(path=zip_path, offset=offset, length=length)  # raw byte range; nothing is decompressed here
+
+
+def parse_path(path: str) -> Tuple[str, List[int]]:
+    """Split a data path into the file path and its optional ZIP slice.
+
+    Accepted forms:
+    1. a path to a .npy/.wav/.flac/.ogg file
+    2. a stored ZIP file with slicing info: "[zip_path]:[offset]:[length]"
+
+    Returns:
+        file_path (str): the file path
+        slice_ptr (list of int): empty in case 1; byte offset and length in case 2
+
+    Raises FileNotFoundError if a path of the second form names no existing file.
+    """
+    # Case 1: recognised extension, the whole string is the file path.
+    if Path(path).suffix in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
+        return path, []
+    # Case 2: anything else is split on ":"; the first field must be an existing file.
+    file_path, *raw_slice = path.split(":")
+    if not Path(file_path).is_file():
+        raise FileNotFoundError(f"File not found: {file_path}")
+    assert len(raw_slice) in {0, 2}, f"Invalid path: {path}"
+    # int() raises ValueError when an offset or length field is not numeric.
+    return file_path, [int(field) for field in raw_slice]
+
+
+def get_window(window_fn: callable, n_fft: int, win_length: int) -> torch.Tensor:
+    extra = n_fft - win_length  # total zero padding needed to reach n_fft samples
+    assert extra >= 0
+    return F.pad(window_fn(win_length), (extra // 2, extra - extra // 2))
+
+
+def get_fourier_basis(n_fft: int) -> torch.Tensor:
+    """Stack the real and imaginary parts of the first ``n_fft // 2 + 1`` DFT rows."""
+    n_rows = n_fft // 2 + 1
+    dft_rows = np.fft.fft(np.eye(n_fft))[:n_rows, :]
+    stacked = np.vstack([np.real(dft_rows), np.imag(dft_rows)])
+    return torch.from_numpy(stacked).float()
+
+
+def get_mel_filters(sample_rate: int, n_fft: int, n_mels: int, f_min: float, f_max: float) -> torch.Tensor:
+    """Return librosa's mel filter bank as a float32 tensor; raise ImportError without librosa."""
+    try:
+        import librosa
+    except ImportError:
+        raise ImportError("Please install librosa: pip install librosa")
+    # Keywords, not positionals: librosa >= 0.10 makes the filters.mel arguments keyword-only.
+    basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=f_min, fmax=f_max)
+    return torch.from_numpy(basis).float()
+
+
+class TTSSpectrogram(torch.nn.Module):
+    """Magnitude (and optionally phase) spectrogram computed with a 1-D convolution."""
+
+    def __init__(
+        self,
+        n_fft: int,
+        win_length: int,
+        hop_length: int,
+        window_fn: callable = torch.hann_window,
+        return_phase: bool = False,
+    ) -> None:
+        super(TTSSpectrogram, self).__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.return_phase = return_phase
+        # Windowed Fourier basis, used as the convolution kernel in forward().
+        windowed = get_fourier_basis(n_fft).unsqueeze(1) * get_window(window_fn, n_fft, win_length)
+        self.register_buffer("basis", windowed)
+
+    def forward(
+        self, waveform: torch.Tensor
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Return the magnitude, or ``(magnitude, phase)`` when ``return_phase`` is set."""
+        half = self.n_fft // 2
+        padded = F.pad(waveform.unsqueeze(1), (half, half), mode="reflect")
+        spec = F.conv1d(padded, self.basis, stride=self.hop_length)
+        real_part, imag_part = spec[:, : half + 1, :], spec[:, half + 1 :, :]
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        if not self.return_phase:
+            return magnitude
+        return magnitude, torch.atan2(imag_part, real_part)
+
+
+class TTSMelScale(torch.nn.Module):
+    """Multiply a spectrogram by a mel filter bank that is stored as a module buffer."""
+    def __init__(
+        self, n_mels: int, sample_rate: int, f_min: float, f_max: float, n_stft: int
+    ) -> None:
+        super(TTSMelScale, self).__init__()
+        self.register_buffer("basis", get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max))
+
+    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
+        return self.basis.matmul(specgram)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
new file mode 100644
index 0000000000..6be6f6521c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
@@ -0,0 +1,387 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from argparse import Namespace
+from copy import deepcopy
+from pathlib import Path
+from typing import Dict, Optional
+
+from fairseq.data import Dictionary
+
+logger = logging.getLogger(__name__)
+
+
+def get_config_from_yaml(yaml_path: Path):
+    """Load the YAML file at ``yaml_path`` and return its parsed content.
+    Raises ImportError without PyYAML, FileNotFoundError if the file is missing,
+    and Exception (chained to the cause) if reading or parsing fails."""
+    try:
+        import yaml
+    except ImportError:
+        # Fail here: merely printing the hint left ``yaml`` unbound for the code below.
+        raise ImportError("Please install PyYAML: pip install PyYAML")
+    if not yaml_path.is_file():
+        raise FileNotFoundError(f"{yaml_path.as_posix()} not found")
+    try:
+        with open(yaml_path) as f:
+            return yaml.load(f, Loader=yaml.FullLoader)  # NOTE: prefer safe_load if configs may be untrusted
+    except Exception as e:
+        raise Exception(f"Failed to load config from {yaml_path.as_posix()}: {e}") from e
+
+
+class S2TDataConfig(object):
+    """Read-only accessors over the data config loaded from a YAML file."""
+
+    def __init__(self, yaml_path: Path):
+        self.config = get_config_from_yaml(yaml_path)
+        self.root = yaml_path.parent  # base directory for relative paths found in the config
+
+    def _auto_convert_to_abs_path(self, x):
+        """Resolve strings (recursively inside dicts) against ``self.root`` when only that path exists."""
+        if isinstance(x, str):
+            if not Path(x).exists() and (self.root / x).exists():
+                return (self.root / x).as_posix()
+        elif isinstance(x, dict):
+            return {k: self._auto_convert_to_abs_path(v) for k, v in x.items()}
+        return x
+
+    @property
+    def vocab_filename(self):
+        """fairseq vocabulary file under data root"""
+        return self.config.get("vocab_filename", "dict.txt")
+
+    @property
+    def speaker_set_filename(self):
+        """speaker set file under data root"""
+        return self.config.get("speaker_set_filename", None)
+
+    @property
+    def shuffle(self) -> bool:
+        """Shuffle dataset samples before batching"""
+        return self.config.get("shuffle", False)
+
+    @property
+    def pre_tokenizer(self) -> Dict:
+        """Pre-tokenizer to apply before subword tokenization. Returning
+        a dictionary with `tokenizer` providing the tokenizer name and
+        the other items providing the tokenizer-specific arguments.
+        Tokenizers are defined in `fairseq.data.encoders.*`"""
+        tokenizer = self.config.get("pre_tokenizer", {"tokenizer": None})
+        return self._auto_convert_to_abs_path(tokenizer)
+
+    @property
+    def bpe_tokenizer(self) -> Dict:
+        """Subword tokenizer to apply after pre-tokenization. Returning
+        a dictionary with `bpe` providing the tokenizer name and
+        the other items providing the tokenizer-specific arguments.
+        Tokenizers are defined in `fairseq.data.encoders.*`"""
+        tokenizer = self.config.get("bpe_tokenizer", {"bpe": None})
+        return self._auto_convert_to_abs_path(tokenizer)
+
+    @property
+    def prepend_tgt_lang_tag(self) -> bool:
+        """Prepend target lang ID token as the target BOS (e.g. for to-many
+        multilingual setting). During inference, this requires `--prefix-size 1`
+        to force BOS to be lang ID token."""
+        return self.config.get("prepend_tgt_lang_tag", False)
+
+    @property
+    def prepend_bos_and_append_tgt_lang_tag(self) -> bool:
+        """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining)."""
+        return self.config.get("prepend_bos_and_append_tgt_lang_tag", False)
+
+    @property
+    def input_feat_per_channel(self):
+        """The dimension of input features (per audio channel)"""
+        return self.config.get("input_feat_per_channel", 80)
+
+    @property
+    def input_channels(self):
+        """The number of channels in the input audio"""
+        return self.config.get("input_channels", 1)
+
+    @property
+    def sample_rate(self):
+        return self.config.get("sample_rate", 16_000)
+
+    @property
+    def sampling_alpha(self):
+        """Hyper-parameter alpha = 1/T for temperature-based resampling.
+        (alpha = 1 for no resampling)"""
+        return self.config.get("sampling_alpha", 1.0)
+
+    @property
+    def use_audio_input(self):
+        """Needed by the dataset loader to see if the model requires
+        raw audio as inputs."""
+        return self.config.get("use_audio_input", False)
+
+    # NOTE(review): plain method, unlike the neighbouring properties -- call it as standardize_audio().
+    def standardize_audio(self) -> bool:
+        return self.use_audio_input and self.config.get("standardize_audio", False)
+
+    @property
+    def use_sample_rate(self):
+        """Needed by the dataset loader to see if the model requires
+        raw audio with specific sample rate as inputs."""
+        return self.config.get("use_sample_rate", 16000)
+
+    @property
+    def audio_root(self):
+        """Audio paths in the manifest TSV can be relative and this provides
+        the root path. Set this to empty string when using absolute paths."""
+        return self.config.get("audio_root", "")
+
+    def get_transforms(self, transform_type, split, is_train):
+        """Split-specific feature transforms. Allowing train set
+        wildcard `_train`, evaluation set wildcard `_eval` and general
+        wildcard `*` for matching."""
+        _cur = self.config.get(f"{transform_type}transforms", {})
+        cur = _cur.get(split)
+        cur = _cur.get("_train") if cur is None and is_train else cur
+        cur = _cur.get("_eval") if cur is None and not is_train else cur
+        cur = _cur.get("*") if cur is None else cur
+        # Copy only the selected entry so callers can extend it without touching self.config.
+        return deepcopy(cur)
+
+    def get_feature_transforms(self, split, is_train):
+        cfg = deepcopy(self.config)
+        # TODO: deprecate transforms
+        cur = self.get_transforms("", split, is_train)  # legacy "transforms" section
+        if cur is not None:
+            logger.warning(
+                "Auto converting transforms into feature_transforms, "
+                "but transforms will be deprecated in the future. Please "
+                "update this in the config."
+            )
+            ft_transforms = self.get_transforms("feature_", split, is_train)
+            if ft_transforms:
+                cur.extend(ft_transforms)  # safe: get_transforms returned a private copy
+        else:
+            cur = self.get_transforms("feature_", split, is_train)
+        cfg["feature_transforms"] = cur
+        return cfg
+
+    def get_waveform_transforms(self, split, is_train):
+        cfg = deepcopy(self.config)
+        cfg["waveform_transforms"] = self.get_transforms("waveform_", split, is_train)
+        return cfg
+
+    def get_dataset_transforms(self, split, is_train):
+        cfg = deepcopy(self.config)
+        cfg["dataset_transforms"] = self.get_transforms("dataset_", split, is_train)
+        return cfg
+
+    @property
+    def global_cmvn_stats_npz(self) -> Optional[str]:
+        path = self.config.get("global_cmvn", {}).get("stats_npz_path", None)
+        return self._auto_convert_to_abs_path(path)
+
+    @property
+    def vocoder(self) -> Dict[str, str]:
+        vocoder = self.config.get("vocoder", {"type": "griffin_lim"})
+        return self._auto_convert_to_abs_path(vocoder)
+
+    @property
+    def hub(self) -> Dict[str, str]:
+        return self.config.get("hub", {})
+
+
+class S2SDataConfig(S2TDataConfig):
+    """Data config accessors for speech-to-speech, layered on S2TDataConfig."""
+
+    @property
+    def vocab_filename(self):
+        """fairseq vocabulary file under data root"""
+        return self.config.get("vocab_filename", None)
+
+    @property
+    def pre_tokenizer(self) -> Dict:
+        return None  # always None in this subclass, despite the Dict annotation
+
+    @property
+    def bpe_tokenizer(self) -> Dict:
+        return None  # always None in this subclass, despite the Dict annotation
+
+    @property
+    def input_transformed_channels(self):
+        """The number of channels in the audio after feature transforms"""
+        # TODO: move this into individual transforms
+        # TODO: deprecate transforms
+        legacy = self.config.get("transforms", {})
+        ft_transforms = self.config.get("feature_transforms", {})
+        # Merge into a fresh dict: the previous in-place update() rewrote
+        # self.config["transforms"] on every access, and a config holding only the
+        # legacy "transforms" key was ignored. feature_transforms entries win on clashes.
+        merged = {**(legacy or {}), **(ft_transforms or {})}
+        cur = merged.get("_train", [])
+
+        _channels = self.input_channels
+        if "delta_deltas" in cur:
+            _channels *= 3
+
+        return _channels
+
+    @property
+    def output_sample_rate(self):
+        """The audio sample rate of output target speech"""
+        return self.config.get("output_sample_rate", 22050)
+
+    @property
+    def target_speaker_embed(self):
+        """Target speaker embedding file (one line per target audio sample)"""
+        return self.config.get("target_speaker_embed", None)
+
+    @property
+    def prepend_tgt_lang_tag_as_bos(self) -> bool:
+        """Prepend target lang ID token as the target BOS."""
+        return self.config.get("prepend_tgt_lang_tag_as_bos", False)
+
+
+class MultitaskConfig(object):
+    """Multitask YAML wrapper: maps every task name to its SingleTaskConfig."""
+
+    def __init__(self, yaml_path: Path):
+        """Load the YAML and wrap each top-level entry in a SingleTaskConfig."""
+        raw = get_config_from_yaml(yaml_path)
+        # Keys are task names; values are the per-task settings passed to SingleTaskConfig.
+        self.config = {name: SingleTaskConfig(name, spec) for name, spec in raw.items()}
+
+    def get_all_tasks(self):
+        return self.config
+
+    def get_single_task(self, name):
+        assert name in self.config, f"multitask '{name}' does not exist!"
+        return self.config[name]
+
+    @property
+    def first_pass_decoder_task_index(self):
+        """Position, in ``self.config`` order, of the first-pass text decoder task.
+
+        The last task with a truthy ``is_first_pass_decoder`` wins. If none has
+        it, the last task whose name starts with "target" and whose
+        ``decoder_type`` equals "transformer" is used. Returns -1 when neither
+        rule matches."""
+        tasks = list(self.config.items())
+        flagged = [i for i, (_, task) in enumerate(tasks) if task.is_first_pass_decoder]
+        if flagged:
+            return flagged[-1]
+        fallback = [
+            i for i, (name, task) in enumerate(tasks)
+            if name.startswith("target") and task.decoder_type == "transformer"
+        ]
+        return fallback[-1] if fallback else -1
+
+
+class SingleTaskConfig(object):
+    """Accessors over one task's entry of the multitask YAML config."""
+
+    def __init__(self, name, config):
+        self.task_name = name
+        self.config = config
+        dict_path = config.get("dict", "")
+        # is_file(), not exists(): Path("") is "." which exists but is not a dictionary
+        self.tgt_dict = Dictionary.load(dict_path) if Path(dict_path).is_file() else None
+
+    @property
+    def data(self):
+        """Value of the "data" entry ("" when absent)"""
+        return self.config.get("data", "")
+
+    @property
+    def decoder_type(self):
+        """Value of the "decoder_type" entry ("transformer" when absent)"""
+        return self.config.get("decoder_type", "transformer")
+
+    @property
+    def decoder_args(self):
+        """Decoder arch related args"""
+        args = self.config.get("decoder_args", {})
+        return Namespace(**args)
+
+    @property
+    def criterion_cfg(self):
+        """cfg for the multitask criterion"""
+        # NOTE(review): the config class itself (not an instance) is mutated and returned
+        if self.decoder_type == "ctc":
+            from fairseq.criterions.ctc import CtcCriterionConfig
+
+            cfg = CtcCriterionConfig
+            cfg.zero_infinity = self.config.get("zero_infinity", True)
+        else:
+            from fairseq.criterions.label_smoothed_cross_entropy import (
+                LabelSmoothedCrossEntropyCriterionConfig,
+            )
+
+            cfg = LabelSmoothedCrossEntropyCriterionConfig
+            cfg.label_smoothing = self.config.get("label_smoothing", 0.2)
+        return cfg
+
+    @property
+    def input_from(self):
+        """Condition on encoder/decoder of the main model"""
+        return "decoder" if "decoder_layer" in self.config else "encoder"
+
+    @property
+    def input_layer(self):
+        """Configured layer number minus one; -1 (last encoder layer) by default"""
+        if self.input_from == "decoder":
+            return self.config["decoder_layer"] - 1
+        return self.config.get("encoder_layer", 0) - 1
+
+    @property
+    def loss_weight_schedule(self):
+        """Return 'decay' when both decay keys are configured, else 'fixed'"""
+        decay_keys = ("loss_weight_max", "loss_weight_decay_steps")
+        return "decay" if all(k in self.config for k in decay_keys) else "fixed"
+
+    def get_loss_weight(self, num_updates):
+        """Loss weight after ``num_updates`` updates under the configured schedule"""
+        if self.loss_weight_schedule == "fixed":
+            return self.config.get("loss_weight", 1.0)
+        assert (
+            self.config.get("loss_weight_decay_steps", 0) > 0
+        ), "loss_weight_decay_steps must be greater than 0 for a decay schedule"
+        loss_weight_min = self.config.get("loss_weight_min", 0.0001)
+        loss_weight_max = self.config["loss_weight_max"]
+        step = (loss_weight_max - loss_weight_min) / self.config["loss_weight_decay_steps"]
+        # linear decay from loss_weight_max, clamped at loss_weight_min
+        return max(loss_weight_max - step * num_updates, loss_weight_min)
+
+    @property
+    def prepend_bos_and_append_tgt_lang_tag(self) -> bool:
+        """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining)."""
+        return self.config.get("prepend_bos_and_append_tgt_lang_tag", False)
+
+    @property
+    def eos_token(self):
+        """EOS token during generation"""
+        return self.config.get("eos_token", "<eos>")
+
+    @property
+    def rdrop_alpha(self):
+        """Value of the "rdrop_alpha" entry (0.0 when absent)"""
+        return self.config.get("rdrop_alpha", 0.0)
+
+    @property
+    def is_first_pass_decoder(self):
+        """The "is_first_pass_decoder" flag; when set, raises for a CTC decoder or a name without "target"."""
+        flag = self.config.get("is_first_pass_decoder", False)
+        if flag:
+            if self.decoder_type == "ctc":
+                raise ValueError(
+                    "First-pass decoder in the multi-decoder model must not be CTC."
+                )
+            if "target" not in self.task_name:
+                raise Warning(
+                    'The name of the first-pass decoder does not include "target".'
+                )
+        return flag
+
+    @property
+    def get_lang_tag_mapping(self):
+        """Value of the "lang_tag_mapping" entry ({} when absent)"""
+        return self.config.get("lang_tag_mapping", {})
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
new file mode 100644
index 0000000000..b24c6f731f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
@@ -0,0 +1,53 @@
+import os
+from fairseq.data.audio import (
+    AudioTransform,
+    CompositeAudioTransform,
+    import_transforms,
+    register_audio_transform,
+)
+
+
+class AudioDatasetTransform(AudioTransform):  # base class that register_audio_dataset_transform passes on
+    pass
+
+
+AUDIO_DATASET_TRANSFORM_REGISTRY = {}  # looked up by name in get_audio_dataset_transform
+AUDIO_DATASET_TRANSFORM_CLASS_NAMES = set()  # handed to register_audio_transform with the registry
+
+
+def get_audio_dataset_transform(name):  # KeyError when `name` is not in the registry
+    return AUDIO_DATASET_TRANSFORM_REGISTRY[name]
+
+
+def register_audio_dataset_transform(name):  # applied as @register_audio_dataset_transform("...")
+    return register_audio_transform(  # shared helper, called with this module's base class/registry
+        name,
+        AudioDatasetTransform,
+        AUDIO_DATASET_TRANSFORM_REGISTRY,
+        AUDIO_DATASET_TRANSFORM_CLASS_NAMES,
+    )
+
+
+import_transforms(os.path.dirname(__file__), "dataset")  # NOTE(review): presumably imports sibling modules so their @register decorators run; confirm
+
+
+class CompositeAudioDatasetTransform(CompositeAudioTransform):
+    """Composite of dataset transforms with lookup of a member by class"""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        """Build through the parent's ``_from_config_dict`` with ``return_empty=True``"""
+        factory = super()._from_config_dict
+        return factory(
+            cls, "dataset", get_audio_dataset_transform,
+            CompositeAudioDatasetTransform, config, return_empty=True,
+        )
+
+    def get_transform(self, cls):
+        """First transform that is an instance of ``cls``, or None"""
+        matches = (t for t in self.transforms if isinstance(t, cls))
+        return next(matches, None)
+
+    def has_transform(self, cls):
+        """Whether any held transform is an instance of ``cls``"""
+        return self.get_transform(cls) is not None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
new file mode 100644
index 0000000000..0b632ccf2b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
@@ -0,0 +1,61 @@
+from typing import List
+import numpy as np
+
+from fairseq.data.audio.dataset_transforms import (
+    AudioDatasetTransform,
+    register_audio_dataset_transform,
+)
+
+_DEFAULTS = {"rate": 0.25, "max_tokens": 3000, "attempts": 5}  # fallbacks for ConcatAugment config keys / ctor
+
+
+@register_audio_dataset_transform("concataugment")
+class ConcatAugment(AudioDatasetTransform):
+    """Randomly selects a second sample index to pair with a given sample.
+
+    ``rate`` is the threshold a uniform draw is compared with, ``max_tokens``
+    bounds the frame counts involved (a falsy value disables the bound) and
+    ``attempts`` is the number of random partner draws.
+    """
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        """Build from an optional config dict; missing keys fall back to ``_DEFAULTS``"""
+        settings = config if config is not None else {}
+        keys = ("rate", "max_tokens", "attempts")
+        return ConcatAugment(*[settings.get(key, _DEFAULTS[key]) for key in keys])
+
+    def __init__(
+        self,
+        rate=_DEFAULTS["rate"],
+        max_tokens=_DEFAULTS["max_tokens"],
+        attempts=_DEFAULTS["attempts"],
+    ):
+        self.rate = rate
+        self.max_tokens = max_tokens
+        self.attempts = attempts
+
+    def __repr__(self):
+        shown = ("rate", "max_tokens", "attempts")
+        body = ", ".join(f"{attr}={getattr(self, attr)}" for attr in shown)
+        return f"{self.__class__.__name__}({body})"
+
+    def find_indices(self, index: int, n_frames: List[int], n_samples: int):
+        """Return ``[index]`` alone, or ``[index, partner]`` when a partner is found"""
+        limit = self.max_tokens
+
+        # no pairing: random skip, or the sample alone already exceeds the limit
+        if np.random.random() > self.rate:
+            return [index]
+        if limit and n_frames[index] > limit:
+            return [index]
+
+        # draw random partners until one differs and keeps the total under the limit
+        for _ in range(self.attempts):
+            partner = np.random.randint(0, n_samples)
+            if partner == index:
+                continue
+            if not limit or n_frames[index] + n_frames[partner] < limit:
+                return [index, partner]
+
+        return [index]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
new file mode 100644
index 0000000000..e9ebec2388
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
@@ -0,0 +1,105 @@
+import numpy as np
+import torch
+
+from fairseq.data.audio import rand_uniform
+from fairseq.data.audio.dataset_transforms import (
+    AudioDatasetTransform,
+    register_audio_dataset_transform,
+)
+from fairseq.data.audio.waveform_transforms.noiseaugment import (
+    NoiseAugmentTransform,
+)
+
+_DEFAULTS = {
+    "rate": 0.25,  # compared with np.random.random() per source in NoisyOverlapAugment.__call__
+    "mixing_noise_rate": 0.1,  # threshold choosing noise vs. another utterance as the secondary
+    "noise_path": "",  # handed to NoiseAugmentTransform
+    "noise_snr_min": -5,  # SNR range drawn from when the secondary is noise
+    "noise_snr_max": 5,
+    "utterance_snr_min": -5,  # SNR range drawn from when the secondary is another utterance
+    "utterance_snr_max": 5,
+}
+
+
+@register_audio_dataset_transform("noisyoverlapaugment")
+class NoisyOverlapAugment(AudioDatasetTransform):
+    """Mixes a random segment of another utterance, or of noise, into some sources."""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        _config = {} if config is None else config
+        return NoisyOverlapAugment(
+            _config.get("rate", _DEFAULTS["rate"]),
+            _config.get("mixing_noise_rate", _DEFAULTS["mixing_noise_rate"]),
+            _config.get("noise_path", _DEFAULTS["noise_path"]),
+            _config.get("noise_snr_min", _DEFAULTS["noise_snr_min"]),
+            _config.get("noise_snr_max", _DEFAULTS["noise_snr_max"]),
+            _config.get("utterance_snr_min", _DEFAULTS["utterance_snr_min"]),
+            _config.get("utterance_snr_max", _DEFAULTS["utterance_snr_max"]),
+        )
+
+    def __init__(
+        self,
+        rate=_DEFAULTS["rate"],
+        mixing_noise_rate=_DEFAULTS["mixing_noise_rate"],
+        noise_path=_DEFAULTS["noise_path"],
+        noise_snr_min=_DEFAULTS["noise_snr_min"],
+        noise_snr_max=_DEFAULTS["noise_snr_max"],
+        utterance_snr_min=_DEFAULTS["utterance_snr_min"],
+        utterance_snr_max=_DEFAULTS["utterance_snr_max"],
+    ):
+        self.rate = rate
+        self.mixing_noise_rate = mixing_noise_rate
+        self.noise_shaper = NoiseAugmentTransform(noise_path)
+        self.noise_snr_min = noise_snr_min
+        self.noise_snr_max = noise_snr_max
+        self.utterance_snr_min = utterance_snr_min
+        self.utterance_snr_max = utterance_snr_max
+
+    def __repr__(self):
+        return (
+            self.__class__.__name__
+            + "("
+            + ", ".join(
+                [
+                    f"rate={self.rate}",
+                    f"mixing_noise_rate={self.mixing_noise_rate}",
+                    f"noise_snr_min={self.noise_snr_min}",
+                    f"noise_snr_max={self.noise_snr_max}",
+                    f"utterance_snr_min={self.utterance_snr_min}",
+                    f"utterance_snr_max={self.utterance_snr_max}",
+                ]
+            )
+            + ")"
+        )
+
+    def __call__(self, sources):
+        """Overlay a secondary signal on randomly chosen sources; returns the same list object."""
+        for i, source in enumerate(sources):
+            if np.random.random() > self.rate:
+                continue
+            pri = source.numpy()
+            if np.random.random() > self.mixing_noise_rate:
+                sec = sources[np.random.randint(0, len(sources))].numpy()
+                snr = rand_uniform(self.utterance_snr_min, self.utterance_snr_max)
+            else:
+                sec = self.noise_shaper.pick_sample(source.shape)
+                snr = rand_uniform(self.noise_snr_min, self.noise_snr_max)
+            L1 = pri.shape[-1]
+            L2 = sec.shape[-1]
+            max_mix_len = min(round(L1 / 2), L2)
+            if max_mix_len < 1:
+                continue  # too short to mix: randint(0, 0) would raise ValueError
+            l = np.random.randint(0, max_mix_len)  # mix len
+            s_source = np.random.randint(0, L1 - l)
+            s_sec = np.random.randint(0, L2 - l)
+            get_power = lambda x: np.mean(x**2)
+            if get_power(sec) == 0:
+                continue
+            # scale the secondary so that the primary/secondary power ratio is 10 ** (snr / 10)
+            scl = np.sqrt(get_power(pri) / (np.power(10, snr / 10) * get_power(sec)))
+            pri[s_source : s_source + l] = np.add(
+                pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l])
+            )
+            sources[i] = torch.from_numpy(pri).float()
+        return sources
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
new file mode 100644
index 0000000000..d295013b90
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
@@ -0,0 +1,43 @@
+import os
+from fairseq.data.audio import (
+    AudioTransform,
+    CompositeAudioTransform,
+    import_transforms,
+    register_audio_transform,
+)
+
+
+class AudioFeatureTransform(AudioTransform):  # base class that register_audio_feature_transform passes on
+    pass
+
+
+AUDIO_FEATURE_TRANSFORM_REGISTRY = {}  # looked up by name in get_audio_feature_transform
+AUDIO_FEATURE_TRANSFORM_CLASS_NAMES = set()  # handed to register_audio_transform with the registry
+
+
+def get_audio_feature_transform(name):  # KeyError when `name` is not in the registry
+    return AUDIO_FEATURE_TRANSFORM_REGISTRY[name]
+
+
+def register_audio_feature_transform(name):  # applied as @register_audio_feature_transform("...")
+    return register_audio_transform(  # shared helper, called with this module's base class/registry
+        name,
+        AudioFeatureTransform,
+        AUDIO_FEATURE_TRANSFORM_REGISTRY,
+        AUDIO_FEATURE_TRANSFORM_CLASS_NAMES,
+    )
+
+
+import_transforms(os.path.dirname(__file__), "feature")  # NOTE(review): presumably imports sibling modules so their @register decorators run; confirm
+
+
+class CompositeAudioFeatureTransform(CompositeAudioTransform):
+    @classmethod
+    def from_config_dict(cls, config=None):
+        return super()._from_config_dict(  # parent builder; cls is passed explicitly as first argument
+            cls,
+            "feature",  # NOTE(review): presumably the transform type/config section to read; confirm
+            get_audio_feature_transform,
+            CompositeAudioFeatureTransform,
+            config,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
new file mode 100644
index 0000000000..49d090b11e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
@@ -0,0 +1,37 @@
+import numpy as np
+import torch
+from fairseq.data.audio.feature_transforms import (
+    AudioFeatureTransform,
+    register_audio_feature_transform,
+)
+
+
+@register_audio_feature_transform("delta_deltas")
+class DeltaDeltas(AudioFeatureTransform):
+    """Expand delta-deltas features from spectrum."""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        _config = {} if config is None else config
+        return DeltaDeltas(_config.get("win_length", 5))
+
+    def __init__(self, win_length=5):
+        self.win_length = win_length
+
+    def __repr__(self):
+        return self.__class__.__name__
+
+    def __call__(self, spectrogram):
+        """Append delta and delta-delta features to a T x F NumPy spectrogram (result is T x 3F)."""
+        from torchaudio.functional import compute_deltas
+
+        assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor."
+        # spectrogram is T x F, while compute_deltas takes (…, F, T)
+        spectrogram = torch.from_numpy(spectrogram).transpose(0, 1)
+        delta = compute_deltas(spectrogram, win_length=self.win_length)
+        delta_delta = compute_deltas(delta, win_length=self.win_length)
+
+        out_feat = np.concatenate(
+            [spectrogram.numpy(), delta.numpy(), delta_delta.numpy()], axis=0
+        )
+        return np.transpose(out_feat)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
new file mode 100644
index 0000000000..e457ff176f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
@@ -0,0 +1,29 @@
+import numpy as np
+from fairseq.data.audio.feature_transforms import (
+    AudioFeatureTransform,
+    register_audio_feature_transform,
+)
+
+
+@register_audio_feature_transform("global_cmvn")
+class GlobalCMVN(AudioFeatureTransform):
+    """Global CMVN (cepstral mean and variance normalization). The global mean
+    and variance need to be pre-computed and stored in NumPy format (.npz)."""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        _config = {} if config is None else config
+        return GlobalCMVN(_config.get("stats_npz_path"))
+
+    def __init__(self, stats_npz_path):
+        self.stats_npz_path = stats_npz_path
+        # the .npz archive holds an open file handle; close it once both arrays are read
+        with np.load(stats_npz_path) as stats:
+            self.mean, self.std = stats["mean"], stats["std"]
+
+    def __repr__(self):
+        return self.__class__.__name__ + f'(stats_npz_path="{self.stats_npz_path}")'
+
+    def __call__(self, x):
+        """Return ``(x - mean) / std`` using the statistics loaded from the .npz file."""
+        return np.divide(np.subtract(x, self.mean), self.std)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
new file mode 100644
index 0000000000..ce5802b41a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
@@ -0,0 +1,131 @@
+import math
+import numbers
+from typing import Optional
+
+import numpy as np
+from fairseq.data.audio.feature_transforms import (
+    AudioFeatureTransform,
+    register_audio_feature_transform,
+)
+
+
+@register_audio_feature_transform("specaugment")
+class SpecAugmentTransform(AudioFeatureTransform):
+    """SpecAugment (https://arxiv.org/abs/1904.08779)"""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        _config = {} if config is None else config
+        return SpecAugmentTransform(
+            _config.get("time_warp_W", 0),
+            _config.get("freq_mask_N", 0),
+            _config.get("freq_mask_F", 0),
+            _config.get("time_mask_N", 0),
+            _config.get("time_mask_T", 0),
+            _config.get("time_mask_p", 0.0),
+            _config.get("mask_value", None),  # None here, whereas the constructor default is 0.0
+        )
+
+    def __init__(
+        self,
+        time_warp_w: int = 0,
+        freq_mask_n: int = 0,
+        freq_mask_f: int = 0,
+        time_mask_n: int = 0,
+        time_mask_t: int = 0,
+        time_mask_p: float = 0.0,
+        mask_value: Optional[float] = 0.0,  # None => __call__ masks with the spectrogram mean
+    ):
+        # Sanity checks
+        assert mask_value is None or isinstance(
+            mask_value, numbers.Number
+        ), f"mask_value (type: {type(mask_value)}) must be None or a number"
+        if freq_mask_n > 0:
+            assert freq_mask_f > 0, (
+                f"freq_mask_F ({freq_mask_f}) "
+                f"must be larger than 0 when doing freq masking."
+            )
+        if time_mask_n > 0:
+            assert time_mask_t > 0, (
+                f"time_mask_T ({time_mask_t}) must be larger than 0 when "
+                f"doing time masking."
+            )
+
+        self.time_warp_w = time_warp_w
+        self.freq_mask_n = freq_mask_n
+        self.freq_mask_f = freq_mask_f
+        self.time_mask_n = time_mask_n
+        self.time_mask_t = time_mask_t
+        self.time_mask_p = time_mask_p
+        self.mask_value = mask_value
+
+    def __repr__(self):  # mask_value is not part of the representation
+        return (
+            self.__class__.__name__
+            + "("
+            + ", ".join(
+                [
+                    f"time_warp_w={self.time_warp_w}",
+                    f"freq_mask_n={self.freq_mask_n}",
+                    f"freq_mask_f={self.freq_mask_f}",
+                    f"time_mask_n={self.time_mask_n}",
+                    f"time_mask_t={self.time_mask_t}",
+                    f"time_mask_p={self.time_mask_p}",
+                ]
+            )
+            + ")"
+        )
+
+    def __call__(self, spectrogram):  # rows are frames, columns are frequency bins (see shape use below)
+        assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor."
+
+        distorted = spectrogram.copy()  # make a copy of input spectrogram.
+        num_frames = spectrogram.shape[0]  # or 'tau' in the paper.
+        num_freqs = spectrogram.shape[1]  # or 'miu' in the paper.
+        mask_value = self.mask_value
+
+        if mask_value is None:  # if no value was specified, use local mean.
+            mask_value = spectrogram.mean()
+
+        if num_frames == 0:  # nothing to augment: the input object itself is returned
+            return spectrogram
+
+        if num_freqs < self.freq_mask_f:  # fewer bins than the max mask width: skip all augmentation
+            return spectrogram
+
+        if self.time_warp_w > 0:
+            if 2 * self.time_warp_w < num_frames:  # keeps the randint range for w0 non-empty
+                import cv2
+
+                w0 = np.random.randint(self.time_warp_w, num_frames - self.time_warp_w)  # split frame
+                w = np.random.randint(-self.time_warp_w + 1, self.time_warp_w)  # shift of the split
+                upper, lower = distorted[:w0, :], distorted[w0:, :]
+                upper = cv2.resize(  # first w0 frames resized to w0 + w frames
+                    upper, dsize=(num_freqs, w0 + w), interpolation=cv2.INTER_LINEAR
+                )
+                lower = cv2.resize(  # remaining frames resized so the total stays num_frames
+                    lower,
+                    dsize=(num_freqs, num_frames - w0 - w),
+                    interpolation=cv2.INTER_LINEAR,
+                )
+                distorted = np.concatenate((upper, lower), axis=0)
+
+        for _i in range(self.freq_mask_n):
+            f = np.random.randint(0, self.freq_mask_f)  # mask width, in [0, freq_mask_f)
+            f0 = np.random.randint(0, num_freqs - f)  # first masked bin
+            if f != 0:
+                distorted[:, f0 : f0 + f] = mask_value
+
+        max_time_mask_t = min(  # mask width is also capped at a time_mask_p fraction of the frames
+            self.time_mask_t, math.floor(num_frames * self.time_mask_p)
+        )
+        if max_time_mask_t < 1:  # no room for a time mask; freq masks (if any) are kept
+            return distorted
+
+        for _i in range(self.time_mask_n):
+            t = np.random.randint(0, max_time_mask_t)  # mask length, in [0, max_time_mask_t)
+            t0 = np.random.randint(0, num_frames - t)  # first masked frame
+            if t != 0:
+                distorted[t0 : t0 + t, :] = mask_value
+
+        return distorted
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
new file mode 100644
index 0000000000..37637bc09a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
@@ -0,0 +1,41 @@
+import numpy as np
+
+from fairseq.data.audio.feature_transforms import (
+    AudioFeatureTransform,
+    register_audio_feature_transform,
+)
+
+
+@register_audio_feature_transform("utterance_cmvn")
+class UtteranceCMVN(AudioFeatureTransform):
+    """Per-utterance CMVN: normalizes mean and/or variance along axis 0 of each input"""
+
+    @classmethod
+    def from_config_dict(cls, config=None):
+        """Build from an optional config dict; both switches default to True."""
+        settings = config if config is not None else {}
+        use_means = settings.get("norm_means", True)
+        use_vars = settings.get("norm_vars", True)
+        return UtteranceCMVN(use_means, use_vars)
+
+    def __init__(self, norm_means=True, norm_vars=True):
+        self.norm_means = norm_means
+        self.norm_vars = norm_vars
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return f"{name}(norm_means={self.norm_means}, norm_vars={self.norm_vars})"
+
+    def __call__(self, x):
+        """Normalize ``x`` using statistics taken over its first axis."""
+        mu = x.mean(axis=0)
+        # both statistics come from the raw input, before any mean subtraction
+        sum_sq = (x**2).sum(axis=0)
+
+        out = np.subtract(x, mu) if self.norm_means else x
+        if not self.norm_vars:
+            return out
+        variance = sum_sq / out.shape[0] - mu**2
+        # floor the variance so the square root and the division stay well defined
+        sigma = np.sqrt(np.maximum(variance, 1e-10))
+        return np.divide(out, sigma)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
new file mode 100644
index 0000000000..b54654d492
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import csv
+import logging
+import os.path as op
+from typing import List, Optional
+
+import numpy as np
+import torch
+from fairseq.data import Dictionary
+from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig
+from fairseq.data.audio.text_to_speech_dataset import (
+    TextToSpeechDataset,
+    TextToSpeechDatasetCreator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class FrmTextToSpeechDataset(TextToSpeechDataset):
+    def __init__(
+        self,
+        split: str,
+        is_train_split: bool,
+        data_cfg: S2TDataConfig,
+        audio_paths: List[str],
+        n_frames: List[int],
+        src_texts: Optional[List[str]] = None,
+        tgt_texts: Optional[List[str]] = None,
+        speakers: Optional[List[str]] = None,
+        src_langs: Optional[List[str]] = None,
+        tgt_langs: Optional[List[str]] = None,
+        ids: Optional[List[str]] = None,
+        tgt_dict: Optional[Dictionary] = None,
+        pre_tokenizer=None,
+        bpe_tokenizer=None,
+        n_frames_per_step=1,
+        speaker_to_id=None,
+        do_chunk=False,  # enable random chunking of training items (see __getitem__)
+        chunk_bound=-1,  # upper cap on the chunk size; values <= 0 mean no cap (see set_epoch)
+        chunk_init=50,  # chunk size at epoch 0
+        chunk_incr=5,  # chunk size growth per epoch
+        add_eos=True,  # append EOS to every returned target
+        dedup=True,  # collapse consecutive repeated target tokens
+        ref_fpu=-1,  # expected frames-per-unit; -1 disables the consistency check
+    ):
+        # It assumes texts are encoded at a fixed frame-rate
+        super().__init__(
+            split=split,
+            is_train_split=is_train_split,
+            data_cfg=data_cfg,
+            audio_paths=audio_paths,
+            n_frames=n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            n_frames_per_step=n_frames_per_step,
+            speaker_to_id=speaker_to_id,
+        )
+
+        self.do_chunk = do_chunk
+        self.chunk_bound = chunk_bound
+        self.chunk_init = chunk_init
+        self.chunk_incr = chunk_incr
+        self.add_eos = add_eos
+        self.dedup = dedup
+        self.ref_fpu = ref_fpu
+
+        self.chunk_size = -1  # chunking stays inactive until set_epoch assigns a positive size
+
+        if do_chunk:
+            assert self.chunk_incr >= 0
+            assert self.pre_tokenizer is None
+
+    def __getitem__(self, index):
+        index, source, target, speaker_id, _, _, _ = super().__getitem__(index)  # last 3 fields unused
+        if target[-1].item() == self.tgt_dict.eos_index:  # drop a trailing EOS; re-added below if add_eos
+            target = target[:-1]
+
+        fpu = source.size(0) / target.size(0)  # frame-per-unit
+        fps = self.n_frames_per_step
+        assert (  # relative deviation from ref_fpu must stay below 10% unless the check is disabled
+            self.ref_fpu == -1 or abs((fpu * fps - self.ref_fpu) / self.ref_fpu) < 0.1
+        ), f"{fpu*fps} != {self.ref_fpu}"
+
+        # only chunk training split
+        if self.is_train_split and self.do_chunk and self.chunk_size > 0:
+            # NOTE(review): prepend_tgt_lang_tag is presumably a bool, so the split point is 0 or 1; confirm
+            lang = target[: int(self.data_cfg.prepend_tgt_lang_tag)]
+            text = target[int(self.data_cfg.prepend_tgt_lang_tag) :]
+            size = len(text)
+            chunk_size = min(self.chunk_size, size)
+            chunk_start = np.random.randint(size - chunk_size + 1)  # random window start, inclusive range
+            text = text[chunk_start : chunk_start + chunk_size]
+            target = torch.cat((lang, text), 0)
+
+            f_size = int(np.floor(chunk_size * fpu))  # unit counts converted to frame counts via fpu
+            f_start = int(np.floor(chunk_start * fpu))
+            assert f_size > 0
+            source = source[f_start : f_start + f_size, :]
+
+        if self.dedup:
+            target = torch.unique_consecutive(target)
+
+        if self.add_eos:
+            eos_idx = self.tgt_dict.eos_index
+            target = torch.cat((target, torch.LongTensor([eos_idx])), 0)
+
+        return index, source, target, speaker_id  # 4 fields, fewer than the parent's 7
+
+    def set_epoch(self, epoch):
+        if self.is_train_split and self.do_chunk:
+            old = self.chunk_size
+            self.chunk_size = self.chunk_init + epoch * self.chunk_incr  # grows linearly with the epoch
+            if self.chunk_bound > 0:
+                self.chunk_size = min(self.chunk_size, self.chunk_bound)
+            logger.info(
+                (
+                    f"{self.split}: setting chunk size "
+                    f"from {old} to {self.chunk_size}"
+                )
+            )
+
+
+class FrmTextToSpeechDatasetCreator(TextToSpeechDatasetCreator):
+    # inherit for key names
+    @classmethod
+    def from_tsv(  # builds a FrmTextToSpeechDataset from "<root>/<split>.tsv"
+        cls,
+        root: str,
+        data_cfg: S2TDataConfig,
+        split: str,
+        tgt_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        is_train_split: bool,
+        n_frames_per_step: int,
+        speaker_to_id,
+        do_chunk: bool = False,
+        chunk_bound: int = -1,
+        chunk_init: int = 50,
+        chunk_incr: int = 5,
+        add_eos: bool = True,
+        dedup: bool = True,
+        ref_fpu: float = -1,
+    ) -> FrmTextToSpeechDataset:
+        tsv_path = op.join(root, f"{split}.tsv")
+        if not op.isfile(tsv_path):
+            raise FileNotFoundError(f"Dataset not found: {tsv_path}")
+        with open(tsv_path) as f:  # NOTE(review): no explicit encoding, so the platform default applies
+            reader = csv.DictReader(  # tab-separated, quoting disabled: quote characters are plain text
+                f,
+                delimiter="\t",
+                quotechar=None,
+                doublequote=False,
+                lineterminator="\n",
+                quoting=csv.QUOTE_NONE,
+            )
+            s = [dict(e) for e in reader]
+            assert len(s) > 0
+
+        # required columns are indexed directly (KeyError if missing); optional ones use class defaults
+        ids = [ss[cls.KEY_ID] for ss in s]
+        audio_paths = [op.join(data_cfg.audio_root, ss[cls.KEY_AUDIO]) for ss in s]
+        n_frames = [int(ss[cls.KEY_N_FRAMES]) for ss in s]
+        tgt_texts = [ss[cls.KEY_TGT_TEXT] for ss in s]
+        src_texts = [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s]
+        speakers = [ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s]
+        src_langs = [ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s]
+        tgt_langs = [ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s]
+
+        return FrmTextToSpeechDataset(
+            split=split,
+            is_train_split=is_train_split,
+            data_cfg=data_cfg,
+            audio_paths=audio_paths,
+            n_frames=n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            n_frames_per_step=n_frames_per_step,
+            speaker_to_id=speaker_to_id,
+            do_chunk=do_chunk,
+            chunk_bound=chunk_bound,
+            chunk_init=chunk_init,
+            chunk_incr=chunk_incr,
+            add_eos=add_eos,
+            dedup=dedup,
+            ref_fpu=ref_fpu,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
new file mode 100644
index 0000000000..f09b065fdc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
@@ -0,0 +1,356 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import os
+import sys
+from typing import Any, List, Optional, Union
+
+import numpy as np
+
+import torch
+import torch.nn.functional as F
+from fairseq.data import data_utils
+from fairseq.data.fairseq_dataset import FairseqDataset
+from fairseq.data.audio.audio_utils import (
+    parse_path,
+    read_from_stored_zip,
+)
+import io
+
+logger = logging.getLogger(__name__)
+
+
+def load_audio(manifest_path, max_keep, min_keep):
+    """Read an audio manifest and keep the entries within the size bounds.
+
+    The first line of the manifest is taken as the root directory. Every
+    following line must hold exactly two tab-separated fields: a file name and
+    an integer size.
+
+    Args:
+        manifest_path: path of the manifest file.
+        max_keep: entries with a size above this are skipped (``None``
+            disables the upper bound).
+        min_keep: entries with a size below this are skipped (``None``
+            disables the lower bound).
+
+    Returns:
+        A tuple ``(root, names, inds, tot, sizes)``: the root line, the kept
+        file names, their 0-based entry indices (the root line is not
+        counted), the total number of entry lines read, and the kept sizes.
+
+    Raises:
+        AssertionError: if an entry line does not have exactly two fields.
+    """
+    n_long, n_short = 0, 0
+    names, inds, sizes = [], [], []
+    # Start from 0 so a manifest without any entry line yields tot == 0
+    # instead of failing on an unbound loop variable.
+    tot = 0
+    with open(manifest_path) as f:
+        root = f.readline().strip()
+        for ind, line in enumerate(f):
+            tot = ind + 1
+            items = line.strip().split("\t")
+            assert len(items) == 2, line
+            sz = int(items[1])
+            if min_keep is not None and sz < min_keep:
+                n_short += 1
+            elif max_keep is not None and sz > max_keep:
+                n_long += 1
+            else:
+                names.append(items[0])
+                inds.append(ind)
+                sizes.append(sz)
+    # ``default`` keeps the summary log from raising when nothing was kept.
+    longest = max(sizes, default=None)
+    shortest = min(sizes, default=None)
+    logger.info(
+        (
+            f"max_keep={max_keep}, min_keep={min_keep}, "
+            f"loaded {len(names)}, skipped {n_short} short and {n_long} long, "
+            f"longest-loaded={longest}, shortest-loaded={shortest}"
+        )
+    )
+    return root, names, inds, tot, sizes
+
+
+def load_label(label_path, inds, tot):
+    """Load the label lines selected by ``inds``.
+
+    Every line of ``label_path`` is read with trailing whitespace stripped.
+    The file must contain exactly ``tot`` lines; the lines at the positions
+    listed in ``inds`` are returned, in that order.
+    """
+    with open(label_path) as label_file:
+        all_labels = [raw.rstrip() for raw in label_file]
+    n_read = len(all_labels)
+    assert n_read == tot, f"number of labels does not match ({n_read} != {tot})"
+    return [all_labels[i] for i in inds]
+
+
+def load_label_offset(label_path, inds, tot):
+    """Compute ``(start, end)`` offsets for the label lines selected by ``inds``.
+
+    The length of each line is the byte count of its UTF-8 encoding, measured
+    on the line exactly as yielded by iterating the file. Offsets are the
+    running sums of those lengths, so entry ``i`` spans
+    ``[offsets[i], offsets[i + 1])``. The file must contain exactly ``tot``
+    lines.
+    """
+    boundaries = [0]
+    with open(label_path) as label_file:
+        for raw in label_file:
+            boundaries.append(boundaries[-1] + len(raw.encode("utf-8")))
+    n_lines = len(boundaries) - 1
+    assert n_lines == tot, f"number of labels does not match ({n_lines} != {tot})"
+    return [(boundaries[i], boundaries[i + 1]) for i in inds]
+
+
+def verify_label_lengths(
+    audio_sizes,
+    audio_rate,
+    label_path,
+    label_rate,
+    inds,
+    tot,
+    tol=0.1,  # tolerance in seconds
+):
+    """Warn about (audio, label) pairs whose durations disagree.
+
+    For each selected entry the audio duration (``size / audio_rate``) is
+    compared with the label duration (``number of whitespace-separated tokens
+    / label_rate``). Pairs differing by more than ``tol`` are reported through
+    the module logger; nothing is raised for them. A negative ``label_rate``
+    is treated as a sequence label and the check is skipped entirely.
+
+    ``label_path`` must contain exactly ``tot`` lines; ``inds`` selects the
+    lines that correspond, position by position, to ``audio_sizes``.
+    """
+    if label_rate < 0:
+        logger.info(f"{label_path} is sequence label. skipped")
+        return
+
+    with open(label_path) as label_file:
+        token_counts = [len(raw.rstrip().split()) for raw in label_file]
+    assert len(token_counts) == tot
+    selected_counts = [token_counts[k] for k in inds]
+
+    num_invalid = 0
+    for pos, line_idx in enumerate(inds):
+        n_samples = audio_sizes[pos]
+        n_labels = selected_counts[pos]
+        dur_from_audio = n_samples / audio_rate
+        dur_from_label = n_labels / label_rate
+        if abs(dur_from_audio - dur_from_label) > tol:
+            num_invalid += 1
+            logger.warning(
+                (
+                    f"audio and label duration differ too much "
+                    f"(|{dur_from_audio} - {dur_from_label}| > {tol}) "
+                    f"in line {line_idx+1} of {label_path}. Check if `label_rate` "
+                    f"is correctly set (currently {label_rate}). "
+                    f"num. of samples = {n_samples}; "
+                    f"label length = {n_labels}"
+                )
+            )
+
+    if num_invalid > 0:
+        logger.warning(
+            f"total {num_invalid} (audio, label) pairs with mismatched lengths"
+        )
+
+
+class HubertDataset(FairseqDataset):
+    """Audio dataset that pairs each manifest entry with one or more labels.
+
+    Audio entries come from a manifest read by ``load_audio``. Every label
+    file must have one line per manifest entry. Labels are either held in
+    memory (``store_labels=True``) or re-read from disk on access, using the
+    byte offsets computed by ``load_label_offset``.
+    """
+
+    def __init__(
+        self,
+        manifest_path: str,
+        sample_rate: float,
+        label_paths: List[str],
+        label_rates: Union[List[float], float],  # -1 for sequence labels
+        pad_list: List[str],
+        eos_list: List[str],
+        label_processors: Optional[List[Any]] = None,
+        max_keep_sample_size: Optional[int] = None,
+        min_keep_sample_size: Optional[int] = None,
+        max_sample_size: Optional[int] = None,
+        shuffle: bool = True,
+        pad_audio: bool = False,
+        normalize: bool = False,
+        store_labels: bool = True,
+        random_crop: bool = False,
+        single_target: bool = False,
+    ):
+        """Load the manifest and the label files.
+
+        Args:
+            manifest_path: manifest file handed to ``load_audio``.
+            sample_rate: expected sample rate; ``postprocess`` raises when an
+                audio file reports a different one.
+            label_paths: one label file per label stream.
+            label_rates: one rate per label stream, or a single number that
+                is applied to every stream. A negative rate marks a
+                sequence-level label.
+            pad_list: one padding value per label stream, used when collating.
+            eos_list: stored on the instance; not used inside this class.
+            label_processors: optional callables, one per label stream,
+                applied to each raw label string in ``get_label``.
+            max_keep_sample_size: upper size bound forwarded to ``load_audio``.
+            min_keep_sample_size: lower size bound forwarded to ``load_audio``.
+            max_sample_size: cap on the collated audio length; ``None`` means
+                ``sys.maxsize``.
+            shuffle: if true, ``ordered_indices`` orders equally sized
+                entries randomly.
+            pad_audio: collate to the longest audio of the batch (padding the
+                others) instead of cropping to the shortest.
+            normalize: apply ``F.layer_norm`` over each whole waveform.
+            store_labels: keep all labels in memory instead of reading them
+                from disk on access.
+            random_crop: pick the crop start at random when an audio has to
+                be cropped.
+            single_target: make ``collater`` emit only the first label stream
+                under the keys ``target``, ``target_lengths``, ``ntokens``.
+        """
+        self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio(
+            manifest_path, max_keep_sample_size, min_keep_sample_size
+        )
+        self.sample_rate = sample_rate
+        self.shuffle = shuffle
+        self.random_crop = random_crop
+
+        self.num_labels = len(label_paths)
+        self.pad_list = pad_list
+        self.eos_list = eos_list
+        self.label_processors = label_processors
+        self.single_target = single_target
+        # Replicate any scalar rate (int as well as float) for every label
+        # stream. A float-only check leaves an int scalar un-expanded, which
+        # then fails in the zip() below.
+        self.label_rates = (
+            [label_rates for _ in range(len(label_paths))]
+            if isinstance(label_rates, (int, float))
+            else label_rates
+        )
+        self.store_labels = store_labels
+        if store_labels:
+            self.label_list = [load_label(p, inds, tot) for p in label_paths]
+        else:
+            self.label_paths = label_paths
+            self.label_offsets_list = [
+                load_label_offset(p, inds, tot) for p in label_paths
+            ]
+        assert label_processors is None or len(label_processors) == self.num_labels
+        for label_path, label_rate in zip(label_paths, self.label_rates):
+            verify_label_lengths(
+                self.sizes, sample_rate, label_path, label_rate, inds, tot
+            )
+
+        self.max_sample_size = (
+            max_sample_size if max_sample_size is not None else sys.maxsize
+        )
+        self.pad_audio = pad_audio
+        self.normalize = normalize
+        logger.info(
+            f"pad_audio={pad_audio}, random_crop={random_crop}, "
+            f"normalize={normalize}, max_sample_size={self.max_sample_size}"
+        )
+
+    def get_audio(self, index):
+        """Read entry ``index`` and return it as a post-processed float tensor.
+
+        The path is resolved against the manifest root. When ``parse_path``
+        returns a non-empty slice, the bytes are read from inside a ``.zip``
+        file via ``read_from_stored_zip``; otherwise the file is read directly.
+        """
+        import soundfile as sf
+
+        wav_path = os.path.join(self.audio_root, self.audio_names[index])
+        _path, slice_ptr = parse_path(wav_path)
+        if len(slice_ptr) == 0:
+            wav, cur_sample_rate = sf.read(_path)
+        else:
+            assert _path.endswith(".zip")
+            data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
+            f = io.BytesIO(data)
+            wav, cur_sample_rate = sf.read(f)
+        wav = torch.from_numpy(wav).float()
+        wav = self.postprocess(wav, cur_sample_rate)
+        return wav
+
+    def get_label(self, index, label_idx):
+        """Return the label of entry ``index`` in stream ``label_idx``.
+
+        The raw label string comes from memory or from disk, depending on
+        ``store_labels``, and is passed through the matching label processor
+        when processors were given.
+        """
+        if self.store_labels:
+            label = self.label_list[label_idx][index]
+        else:
+            offset_s, offset_e = self.label_offsets_list[label_idx][index]
+            # The offsets are UTF-8 byte counts, so read raw bytes and decode
+            # them. A text-mode read() counts characters and would run past
+            # the end of a label containing multi-byte characters.
+            with open(self.label_paths[label_idx], "rb") as f:
+                f.seek(offset_s)
+                label = f.read(offset_e - offset_s).decode("utf-8")
+
+        if self.label_processors is not None:
+            label = self.label_processors[label_idx](label)
+        return label
+
+    def get_labels(self, index):
+        """Return the labels of entry ``index`` for every label stream."""
+        return [self.get_label(index, i) for i in range(self.num_labels)]
+
+    def __getitem__(self, index):
+        """Return a dict with the keys ``id``, ``source`` and ``label_list``."""
+        wav = self.get_audio(index)
+        labels = self.get_labels(index)
+        return {"id": index, "source": wav, "label_list": labels}
+
+    def __len__(self):
+        """Return the number of entries kept from the manifest."""
+        return len(self.sizes)
+
+    def crop_to_max_size(self, wav, target_size):
+        """Crop ``wav`` to ``target_size`` elements.
+
+        Returns ``(cropped, start)``. Nothing is cropped (start 0) when
+        ``wav`` is not longer than ``target_size``. The crop starts at 0
+        unless ``random_crop`` is set, in which case the start is drawn with
+        ``np.random.randint``.
+        """
+        size = len(wav)
+        diff = size - target_size
+        if diff <= 0:
+            return wav, 0
+
+        start, end = 0, target_size
+        if self.random_crop:
+            start = np.random.randint(0, diff + 1)
+            end = size - diff + start
+        return wav[start:end], start
+
+    def collater(self, samples):
+        """Merge samples produced by ``__getitem__`` into a batch dict.
+
+        Samples whose ``source`` is ``None`` are dropped; an empty dict is
+        returned when none remain. The batch holds ``id``, ``net_input``
+        (``source`` and ``padding_mask``) and the collated targets, either
+        per stream (``*_list`` keys) or, with ``single_target``, for the first
+        stream only.
+        """
+        # target = max(sizes) -> random_crop not used
+        # target = max_sample_size -> random_crop used for long
+        samples = [s for s in samples if s["source"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        audios = [s["source"] for s in samples]
+        audio_sizes = [len(s) for s in audios]
+        if self.pad_audio:
+            audio_size = min(max(audio_sizes), self.max_sample_size)
+        else:
+            audio_size = min(min(audio_sizes), self.max_sample_size)
+        collated_audios, padding_mask, audio_starts = self.collater_audio(
+            audios, audio_size
+        )
+
+        targets_by_label = [
+            [s["label_list"][i] for s in samples] for i in range(self.num_labels)
+        ]
+        targets_list, lengths_list, ntokens_list = self.collater_label(
+            targets_by_label, audio_size, audio_starts
+        )
+
+        net_input = {"source": collated_audios, "padding_mask": padding_mask}
+        batch = {
+            "id": torch.LongTensor([s["id"] for s in samples]),
+            "net_input": net_input,
+        }
+
+        if self.single_target:
+            batch["target_lengths"] = lengths_list[0]
+            batch["ntokens"] = ntokens_list[0]
+            batch["target"] = targets_list[0]
+        else:
+            batch["target_lengths_list"] = lengths_list
+            batch["ntokens_list"] = ntokens_list
+            batch["target_list"] = targets_list
+        return batch
+
+    def collater_audio(self, audios, audio_size):
+        """Stack ``audios`` into a ``(len(audios), audio_size)`` tensor.
+
+        Shorter audios are zero-padded at the end (only allowed with
+        ``pad_audio``) and flagged in the padding mask; longer ones are
+        cropped with ``crop_to_max_size``.
+
+        Returns:
+            ``(collated_audios, padding_mask, audio_starts)`` where
+            ``audio_starts`` holds the crop start of every audio (0 when it
+            was not cropped).
+        """
+        collated_audios = audios[0].new_zeros(len(audios), audio_size)
+        padding_mask = torch.BoolTensor(collated_audios.shape).fill_(False)
+        audio_starts = [0 for _ in audios]
+        for i, audio in enumerate(audios):
+            diff = len(audio) - audio_size
+            if diff == 0:
+                collated_audios[i] = audio
+            elif diff < 0:
+                assert self.pad_audio
+                collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)])
+                # diff is negative: this marks the last -diff positions.
+                padding_mask[i, diff:] = True
+            else:
+                collated_audios[i], audio_starts[i] = self.crop_to_max_size(
+                    audio, audio_size
+                )
+        return collated_audios, padding_mask, audio_starts
+
+    def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad):
+        """Collate frame-level labels, cropped in step with the audio crop.
+
+        Sample positions are converted to label frames with the ratio
+        ``label_rate / sample_rate``. Without ``pad_audio`` the frame count is
+        additionally limited to what every target still has after its start.
+
+        Returns:
+            ``(targets, lengths, ntokens)``: the padded target batch, the
+            per-target lengths and their sum.
+        """
+        assert label_rate > 0
+        s2f = label_rate / self.sample_rate
+        frm_starts = [int(round(s * s2f)) for s in audio_starts]
+        frm_size = int(round(audio_size * s2f))
+        if not self.pad_audio:
+            rem_size = [len(t) - s for t, s in zip(targets, frm_starts)]
+            frm_size = min(frm_size, *rem_size)
+        targets = [t[s : s + frm_size] for t, s in zip(targets, frm_starts)]
+        logger.debug(f"audio_starts={audio_starts}")
+        logger.debug(f"frame_starts={frm_starts}")
+        logger.debug(f"frame_size={frm_size}")
+
+        lengths = torch.LongTensor([len(t) for t in targets])
+        ntokens = lengths.sum().item()
+        targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
+        return targets, lengths, ntokens
+
+    def collater_seq_label(self, targets, pad):
+        """Collate sequence-level labels without cropping.
+
+        Returns ``(targets, lengths, ntokens)`` like ``collater_frm_label``.
+        """
+        lengths = torch.LongTensor([len(t) for t in targets])
+        ntokens = lengths.sum().item()
+        targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
+        return targets, lengths, ntokens
+
+    def collater_label(self, targets_by_label, audio_size, audio_starts):
+        """Collate every label stream with the collater matching its rate.
+
+        Returns three lists (targets, lengths, token counts) with one element
+        per label stream.
+        """
+        targets_list, lengths_list, ntokens_list = [], [], []
+        itr = zip(targets_by_label, self.label_rates, self.pad_list)
+        for targets, label_rate, pad in itr:
+            # Any negative rate is a sequence label, matching the check in
+            # verify_label_lengths. The frame collater requires a positive
+            # rate.
+            if label_rate < 0:
+                targets, lengths, ntokens = self.collater_seq_label(targets, pad)
+            else:
+                targets, lengths, ntokens = self.collater_frm_label(
+                    targets, audio_size, audio_starts, label_rate, pad
+                )
+            targets_list.append(targets)
+            lengths_list.append(lengths)
+            ntokens_list.append(ntokens)
+        return targets_list, lengths_list, ntokens_list
+
+    def num_tokens(self, index):
+        """Return the same value as ``size(index)``."""
+        return self.size(index)
+
+    def size(self, index):
+        """Return the size of entry ``index``.
+
+        The size is capped at ``max_sample_size`` unless ``pad_audio`` is set.
+        """
+        if self.pad_audio:
+            return self.sizes[index]
+        return min(self.sizes[index], self.max_sample_size)
+
+    def ordered_indices(self):
+        """Return all indices ordered by size, largest first.
+
+        ``np.lexsort`` uses the last key (the sizes) as primary key and the
+        first key to break ties: a random permutation with ``shuffle``, the
+        plain index order otherwise. The result is then reversed.
+        """
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]
+        else:
+            order = [np.arange(len(self))]
+
+        order.append(self.sizes)
+        return np.lexsort(order)[::-1]
+
+    def postprocess(self, wav, cur_sample_rate):
+        """Validate and optionally normalize a waveform tensor.
+
+        A 2-D input is averaged over its last dimension (presumably the
+        channels; confirm against the audio reader). The result must be 1-D.
+
+        Raises:
+            Exception: if ``cur_sample_rate`` differs from ``sample_rate``.
+        """
+        if wav.dim() == 2:
+            wav = wav.mean(-1)
+        assert wav.dim() == 1, wav.dim()
+
+        if cur_sample_rate != self.sample_rate:
+            raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")
+
+        if self.normalize:
+            with torch.no_grad():
+                wav = F.layer_norm(wav, wav.shape)
+        return wav
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
new file mode 100644
index 0000000000..0a42c10611
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2021-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import math
+from typing import List, Optional, NamedTuple
+
+import numpy as np
+import torch
+from fairseq.data import (
+    ConcatDataset,
+    LanguagePairDataset,
+    FileAudioDataset,
+    data_utils,
+)
+from fairseq.data import FairseqDataset
+
+logger = logging.getLogger(__name__)
+
+
+class ModalityDatasetItem(NamedTuple):
+    """One entry of the list passed to ``MultiModalityDataset``."""
+
+    # Stored by MultiModalityDataset as the "mode" of batches from this dataset.
+    datasetname: str
+    # NOTE(review): `any` is the builtin function, not `typing.Any`; it works
+    # at runtime but is not a real type annotation.
+    dataset: any
+    # Passed to the dataset's filter_indices_by_size.
+    max_positions: List[int]
+    # Passed to the dataset's batch_by_size as max_tokens.
+    max_tokens: Optional[int] = None
+    # Passed to the dataset's batch_by_size as max_sentences.
+    max_sentences: Optional[int] = None
+
+
+# MultiModalityDataset: it concatenates multiple datasets with different modalities.
+# Compared with ConcatDataset, it 1) can sample data given the ratios for different
+# datasets and 2) adds a mode to indicate which type of dataset the samples come from.
+# It is used together with GroupedEpochBatchIterator to generate mini-batches with
+# samples from the same type of dataset.
+# If only one dataset is used, it performs like the original dataset with the mode added.
+class MultiModalityDataset(ConcatDataset):
+    """Concatenation of datasets that tags every batch with its source mode.
+
+    Items are returned as ``(dataset_idx, sample)`` pairs, and a batch may only
+    contain samples of a single dataset. Batch samplers are built per dataset
+    and can be repeated or sub-sampled with per-dataset ratios.
+    """
+
+    def __init__(self, datasets: List[ModalityDatasetItem]):
+        """Split the items into parallel per-dataset lists and concatenate."""
+        items = list(datasets)
+        dsets = [item.dataset for item in items]
+        # Every dataset is concatenated with the same weight of 1.0.
+        super().__init__(dsets, [1.0] * len(dsets))
+        self.max_tokens = [item.max_tokens for item in items]
+        self.max_positions = [item.max_positions for item in items]
+        self.max_sentences = [item.max_sentences for item in items]
+        self.id_to_mode = [item.datasetname for item in items]
+        self.raw_sub_batch_samplers = []
+        self._cur_epoch = 0
+
+    def set_epoch(self, epoch):
+        """Forward the epoch to the parent and remember it for re-seeding."""
+        super().set_epoch(epoch)
+        self._cur_epoch = epoch
+
+    def __getitem__(self, idx):
+        """Return ``(dataset_idx, sample)`` for the global index ``idx``."""
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return (dataset_idx, self.datasets[dataset_idx][sample_idx])
+
+    def collater(self, samples):
+        """Collate with the owning dataset and add ``net_input["mode"]``.
+
+        All samples must carry the same dataset index. An empty input yields
+        an empty dict.
+        """
+        if len(samples) == 0:
+            return {}
+        dataset_idx = samples[0][0]
+        # make sure all samples in samples are from same dataset
+        assert all(dataset_idx == pair[0] for pair in samples)
+        payloads = [pair[1] for pair in samples]
+        batch = self.datasets[dataset_idx].collater(payloads)
+        # add mode
+        batch["net_input"]["mode"] = self.id_to_mode[dataset_idx]
+
+        return batch
+
+    def size(self, index: int):
+        """Return the size of ``index``; a single dataset is asked directly."""
+        if len(self.datasets) != 1:
+            return super().size(index)
+        return self.datasets[0].size(index)
+
+    @property
+    def sizes(self):
+        """Return the sizes; a single dataset's own ``sizes`` is used as is."""
+        if len(self.datasets) != 1:
+            return super().sizes
+        return self.datasets[0].sizes
+
+    def ordered_indices(self):
+        """
+        Returns one list entry per dataset, each holding that dataset's own
+        ordered indices (local to the dataset).
+        """
+        if len(self.datasets) == 1:
+            return [self.datasets[0].ordered_indices()]
+        indices_group = []
+        for d_idx, ds in enumerate(self.datasets):
+            lower = self.cumulative_sizes[d_idx - 1] if d_idx > 0 else 0
+            # The cumulative sizes must agree with each dataset's length.
+            assert self.cumulative_sizes[d_idx] - lower == len(ds)
+            indices_group.append(ds.ordered_indices())
+        return indices_group
+
+    def get_raw_batch_samplers(self, required_batch_size_multiple, seed):
+        """Build the per-dataset batch samplers once and cache them.
+
+        Indices are ordered under ``seed``, filtered by each dataset's
+        ``max_positions`` and batched with its token/sentence limits. Nothing
+        is done when the cache is already filled.
+        """
+        if len(self.raw_sub_batch_samplers) > 0:
+            logger.info(" raw_sub_batch_samplers exists. No action is taken")
+            return
+        with data_utils.numpy_seed(seed):
+            indices = self.ordered_indices()
+
+        for i, ds in enumerate(self.datasets):
+            kept = ds.filter_indices_by_size(indices[i], self.max_positions[i])[0]
+            self.raw_sub_batch_samplers.append(
+                ds.batch_by_size(
+                    kept,
+                    max_tokens=self.max_tokens[i],
+                    max_sentences=self.max_sentences[i],
+                    required_batch_size_multiple=required_batch_size_multiple,
+                )
+            )
+
+    def get_batch_samplers(self, mult_ratios, required_batch_size_multiple, seed):
+        """Return one list of batches per dataset, in global indices.
+
+        ``mult_ratios[i]`` scales the number of batches of dataset ``i``: the
+        integer part repeats the whole batch list, the fractional part adds a
+        random subset shuffled under ``seed + current epoch``.
+        """
+        self.get_raw_batch_samplers(required_batch_size_multiple, seed)
+        batch_samplers = []
+        for i in range(len(self.datasets)):
+            raw_batches = self.raw_sub_batch_samplers[i]
+            if i > 0:
+                # Shift local indices into the concatenated index space.
+                offset = self.cumulative_sizes[i - 1]
+                sub_batch_sampler = [
+                    [local + offset for local in one_batch]
+                    for one_batch in raw_batches
+                ]
+            else:
+                sub_batch_sampler = list(raw_batches)
+
+            smp_r = mult_ratios[i]
+            if smp_r == 1:
+                logger.info(
+                    "dataset {} batch number is {} ".format(
+                        self.id_to_mode[i], len(sub_batch_sampler)
+                    )
+                )
+                batch_samplers.append(sub_batch_sampler)
+                continue
+
+            is_increase = "increased" if smp_r > 1 else "decreased"
+            logger.info(
+                "number of batch for the dataset {} is {} from {} to {}".format(
+                    self.id_to_mode[i],
+                    is_increase,
+                    len(sub_batch_sampler),
+                    int(len(sub_batch_sampler) * smp_r),
+                )
+            )
+            whole = math.floor(smp_r)
+            mul_samplers = sub_batch_sampler * whole
+            if whole != smp_r:
+                with data_utils.numpy_seed(seed + self._cur_epoch):
+                    np.random.shuffle(sub_batch_sampler)
+                    smp_num = int((smp_r - whole) * len(sub_batch_sampler))
+                mul_samplers = mul_samplers + sub_batch_sampler[:smp_num]
+            batch_samplers.append(mul_samplers)
+
+        return batch_samplers
+
+
+class LangPairMaskDataset(FairseqDataset):
+    """Wrapper that replaces part of each source sequence with a noise id.
+
+    Everything except ``__getitem__`` is forwarded to the wrapped dataset.
+    """
+
+    def __init__(
+        self,
+        dataset: LanguagePairDataset,
+        src_eos: int,
+        src_bos: Optional[int] = None,
+        noise_id: Optional[int] = -1,
+        mask_ratio: Optional[float] = 0,
+        mask_type: Optional[str] = "random",
+    ):
+        """
+        Args:
+            dataset: the wrapped dataset.
+            src_eos: a last source token equal to this is never masked.
+            src_bos: a first source token equal to this is never masked.
+            noise_id: value written into the masked positions.
+            mask_ratio: masking strength; no masking happens unless it is > 0.
+            mask_type: ``"random"`` (each position is compared against
+                ``mask_ratio`` independently) or ``"tail"`` (the end of the
+                sequence is masked).
+        """
+        self.dataset = dataset
+        self.src_eos, self.src_bos = src_eos, src_bos
+        self.noise_id = noise_id
+        self.mask_ratio, self.mask_type = mask_ratio, mask_type
+        assert mask_type in ("random", "tail")
+
+    @property
+    def src_sizes(self):
+        """Source sizes of the wrapped dataset."""
+        return self.dataset.src_sizes
+
+    @property
+    def tgt_sizes(self):
+        """Target sizes of the wrapped dataset."""
+        return self.dataset.tgt_sizes
+
+    @property
+    def sizes(self):
+        """Sizes of the wrapped dataset."""
+        # dataset.sizes can be a dynamically computed sizes:
+        return self.dataset.sizes
+
+    def get_batch_shapes(self):
+        """Use the wrapped ``get_batch_shapes`` if present, else its ``buckets``."""
+        if not hasattr(self.dataset, "get_batch_shapes"):
+            return self.dataset.buckets
+        return self.dataset.get_batch_shapes()
+
+    def num_tokens_vec(self, indices):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.num_tokens_vec(indices)
+
+    def __len__(self):
+        """Length of the wrapped dataset."""
+        return len(self.dataset)
+
+    def num_tokens(self, index):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        """The wrapped dataset's ``supports_prefetch``, or False if absent."""
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.prefetch(indices)
+
+    def mask_src_tokens(self, sample):
+        """Return a copy of ``sample`` whose source has masked positions.
+
+        Masked positions are filled with ``noise_id``. A leading ``src_bos``
+        and a trailing ``src_eos`` token are always left untouched. Only the
+        keys ``id``, ``source`` and ``target`` are carried over.
+        """
+        src_item = sample["source"]
+        n_tokens = len(src_item)
+        if self.mask_type == "random":
+            mask = torch.rand(n_tokens).le(self.mask_ratio)
+        else:
+            # Keep the first part of the sequence and mask everything after it.
+            n_keep = int(n_tokens * (1 - self.mask_ratio))
+            mask = torch.zeros(n_tokens, dtype=torch.bool)
+            mask[n_keep:] = True
+        if src_item[0] == self.src_bos:
+            mask[0] = False
+        if src_item[-1] == self.src_eos:
+            mask[-1] = False
+        noised = src_item.masked_fill(mask, self.noise_id)
+        return {"id": sample["id"], "source": noised, "target": sample["target"]}
+
+    def __getitem__(self, index):
+        """Return the wrapped sample, masked when ``mask_ratio`` is positive."""
+        sample = self.dataset[index]
+        if self.mask_ratio > 0:
+            return self.mask_src_tokens(sample)
+        return sample
+
+    def collater(self, samples, pad_to_length=None):
+        """Forwarded to the wrapped dataset."""
+        return self.dataset.collater(samples, pad_to_length)
+
+
+class FileAudioDatasetWrapper(FileAudioDataset):
+    """``FileAudioDataset`` whose batches use different ``net_input`` keys."""
+
+    def collater(self, samples):
+        """Collate with the parent class, then rename and extend ``net_input``.
+
+        ``source`` is renamed to ``src_tokens``; ``prev_output_tokens``,
+        ``src_lengths`` and ``alignment`` are added with the value ``None``.
+        An empty parent result yields an empty dict.
+        """
+        batch = super().collater(samples)
+        if len(batch) == 0:
+            return {}
+        net_input = batch["net_input"]
+        net_input["src_tokens"] = net_input["source"]
+        net_input["prev_output_tokens"] = None
+        del net_input["source"]
+        for extra_key in ("src_lengths", "alignment"):
+            net_input[extra_key] = None
+        return batch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
new file mode 100644
index 0000000000..edb307e68f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
@@ -0,0 +1,427 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+import os
+import sys
+import time
+import io
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from .. import FairseqDataset
+from ..data_utils import compute_block_mask_1d, get_buckets, get_bucketed_sizes
+from fairseq.data.audio.audio_utils import (
+    parse_path,
+    read_from_stored_zip,
+    is_sf_audio_data,
+)
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+
+logger = logging.getLogger(__name__)
+
+
+class RawAudioDataset(FairseqDataset):
+    """Base class for raw-waveform datasets; subclasses supply ``__getitem__``.
+
+    It provides waveform post-processing, cropping, collation (with optional
+    padding and bucketing) and size-based ordering.
+    """
+
+    def __init__(
+        self,
+        sample_rate,
+        max_sample_size=None,
+        min_sample_size=0,
+        shuffle=True,
+        pad=False,
+        normalize=False,
+        compute_mask=False,
+        feature_encoder_spec: str = "None",
+        mask_prob: float = 0.75,
+        mask_prob_adjust: float = 0,
+        mask_length: int = 1,
+        inverse_mask: bool = False,
+        require_same_masks: bool = True,
+        clone_batch: int = 1,
+        expand_adjacent: bool = False,
+        mask_dropout: float = 0,
+        non_overlapping: bool = False,
+    ):
+        """
+        Args:
+            sample_rate: expected sample rate; ``postprocess`` raises when a
+                different one is reported.
+            max_sample_size: cap on the collated length; ``None`` means
+                ``sys.maxsize``.
+            min_sample_size: stored on the instance; not used in this class.
+            shuffle: if true, ``ordered_indices`` sorts by size with random
+                tie order; otherwise the plain index order is returned.
+            pad: collate to the longest sample (padding the others) instead
+                of cropping to the shortest.
+            normalize: apply ``F.layer_norm`` over each whole waveform.
+            compute_mask: stored as ``is_compute_mask``.
+            feature_encoder_spec: Python expression, evaluated with
+                ``eval``, that yields an iterable of 3-tuples whose second
+                and third items are used as kernel size and stride.
+            mask_prob, mask_prob_adjust, mask_length, inverse_mask,
+            require_same_masks, clone_batch, expand_adjacent, mask_dropout,
+            non_overlapping: masking settings that are stored on the instance
+                and not used inside this class.
+        """
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.sizes = []
+        self.max_sample_size = (
+            max_sample_size if max_sample_size is not None else sys.maxsize
+        )
+        self.min_sample_size = min_sample_size
+        self.pad = pad
+        self.shuffle = shuffle
+        self.normalize = normalize
+
+        self.is_compute_mask = compute_mask
+        # NOTE(review): eval() executes arbitrary code. The spec string must
+        # only ever come from trusted configuration, never from user input.
+        self.feature_encoder_spec = eval(feature_encoder_spec)
+        # Cache for _get_mask_indices_dims, keyed by (size, padding, dilation).
+        self._features_size_map = {}
+        self.mask_prob = mask_prob
+        self.mask_prob_adjust = mask_prob_adjust
+        self.mask_length = mask_length
+        self.inverse_mask = inverse_mask
+        self.require_same_masks = require_same_masks
+        self.clone_batch = clone_batch
+        self.expand_adjacent = expand_adjacent
+        self.mask_dropout = mask_dropout
+        self.non_overlapping = non_overlapping
+
+    def __getitem__(self, index):
+        """Must be implemented by subclasses."""
+        raise NotImplementedError()
+
+    def __len__(self):
+        """Return the number of entries in ``sizes``."""
+        return len(self.sizes)
+
+    def postprocess(self, feats, curr_sample_rate):
+        """Validate and optionally normalize a waveform tensor.
+
+        A 2-D input is averaged over its last dimension (presumably the
+        channels; confirm against the audio reader). The result must be 1-D.
+
+        Raises:
+            Exception: if ``curr_sample_rate`` differs from ``sample_rate``.
+        """
+        if feats.dim() == 2:
+            feats = feats.mean(-1)
+
+        if curr_sample_rate != self.sample_rate:
+            raise Exception(f"sample rate: {curr_sample_rate}, need {self.sample_rate}")
+
+        assert feats.dim() == 1, feats.dim()
+
+        if self.normalize:
+            with torch.no_grad():
+                feats = F.layer_norm(feats, feats.shape)
+        return feats
+
+    def crop_to_max_size(self, t, target_size, dim=0):
+        """Return a random window of ``target_size`` along dimension ``dim``.
+
+        ``t`` is returned unchanged when it is not longer than
+        ``target_size`` along ``dim``. The window start is drawn with
+        ``np.random.randint``.
+        """
+        size = t.size(dim)
+        diff = size - target_size
+        if diff <= 0:
+            return t
+
+        start = np.random.randint(0, diff + 1)
+        end = size - diff + start
+
+        # Index with an explicit tuple: full slices for the leading
+        # dimensions, then the window on ``dim``. A tuple is the unambiguous
+        # form of a multi-dimensional index.
+        slices = (slice(None),) * dim + (slice(start, end),)
+        return t[slices]
+
+    @staticmethod
+    def _bucket_tensor(tensor, num_pad, value):
+        """Append ``num_pad`` entries of ``value`` to the last dimension."""
+        return F.pad(tensor, (0, num_pad), value=value)
+
+    def collater(self, samples):
+        """Merge samples into a batch dict with ``id`` and ``net_input``.
+
+        Samples whose ``source`` is ``None`` are dropped; an empty dict is
+        returned when none remain. Sources are padded to the longest sample
+        (with ``pad``) or cropped to the shortest, in both cases capped at
+        ``max_sample_size``. ``net_input`` holds ``source``, plus
+        ``padding_mask`` when padding is enabled and ``precomputed_mask``
+        when the samples carry one.
+        """
+        samples = [s for s in samples if s["source"] is not None]
+        if len(samples) == 0:
+            return {}
+
+        sources = [s["source"] for s in samples]
+        sizes = [len(s) for s in sources]
+
+        if self.pad:
+            target_size = min(max(sizes), self.max_sample_size)
+        else:
+            target_size = min(min(sizes), self.max_sample_size)
+
+        collated_sources = sources[0].new_zeros(len(sources), target_size)
+        padding_mask = (
+            torch.BoolTensor(collated_sources.shape).fill_(False) if self.pad else None
+        )
+        for i, (source, size) in enumerate(zip(sources, sizes)):
+            diff = size - target_size
+            if diff == 0:
+                collated_sources[i] = source
+            elif diff < 0:
+                assert self.pad
+                collated_sources[i] = torch.cat(
+                    [source, source.new_full((-diff,), 0.0)]
+                )
+                # diff is negative: this marks the last -diff positions.
+                padding_mask[i, diff:] = True
+            else:
+                collated_sources[i] = self.crop_to_max_size(source, target_size)
+
+        # Named net_input rather than ``input`` to avoid shadowing the builtin.
+        net_input = {"source": collated_sources}
+        out = {"id": torch.LongTensor([s["id"] for s in samples])}
+        if self.pad:
+            net_input["padding_mask"] = padding_mask
+
+        if hasattr(self, "num_buckets") and self.num_buckets > 0:
+            assert self.pad, "Cannot bucket without padding first."
+            bucket = max(self._bucketed_sizes[s["id"]] for s in samples)
+            num_pad = bucket - collated_sources.size(-1)
+            if num_pad:
+                net_input["source"] = self._bucket_tensor(collated_sources, num_pad, 0)
+                net_input["padding_mask"] = self._bucket_tensor(
+                    padding_mask, num_pad, True
+                )
+
+        if "precomputed_mask" in samples[0]:
+            target_size = self._get_mask_indices_dims(target_size)
+            collated_mask = torch.cat(
+                [
+                    self.crop_to_max_size(s["precomputed_mask"], target_size, dim=1)
+                    for s in samples
+                ],
+                dim=0,
+            )
+            net_input["precomputed_mask"] = collated_mask
+
+        out["net_input"] = net_input
+        return out
+
+    def _get_mask_indices_dims(self, size, padding=0, dilation=1):
+        """Return the length left after applying every encoder layer to ``size``.
+
+        Each ``(_, kernel_size, stride)`` entry of ``feature_encoder_spec``
+        maps a length ``L`` to
+        ``1 + (L + 2 * padding - dilation * (kernel_size - 1) - 1) // stride``.
+        Results are cached per ``(size, padding, dilation)``.
+        """
+        # Look the result up in the cache itself (the spec never contains a
+        # size), and key on every argument so that different padding or
+        # dilation values cannot return a stale entry.
+        key = (size, padding, dilation)
+        if key not in self._features_size_map:
+            # Start from the input length so an empty spec maps to itself.
+            L_out = size
+            for (_, kernel_size, stride) in self.feature_encoder_spec:
+                L_out = L_out + 2 * padding - dilation * (kernel_size - 1) - 1
+                L_out = 1 + L_out // stride
+            self._features_size_map[key] = L_out
+        return self._features_size_map[key]
+
+    def num_tokens(self, index):
+        """Return the same value as ``size(index)``."""
+        return self.size(index)
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        if self.pad:
+            return self.sizes[index]
+        return min(self.sizes[index], self.max_sample_size)
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order.
+
+        With ``shuffle`` the indices are sorted by size capped at
+        ``max_sample_size``, largest first, with ties in random order.
+        Without it the plain index order is returned.
+        """
+
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]
+            order.append(
+                np.minimum(
+                    np.array(self.sizes),
+                    self.max_sample_size,
+                )
+            )
+            return np.lexsort(order)[::-1]
+        else:
+            return np.arange(len(self))
+
+    def set_bucket_info(self, num_buckets):
+        """Store ``num_buckets`` and, when positive, precompute bucket sizes.
+
+        The buckets are derived from the sizes capped at ``max_sample_size``
+        using ``get_buckets`` and ``get_bucketed_sizes``.
+        """
+        self.num_buckets = num_buckets
+        if self.num_buckets > 0:
+            self._collated_sizes = np.minimum(
+                np.array(self.sizes),
+                self.max_sample_size,
+            )
+            self.buckets = get_buckets(
+                self._collated_sizes,
+                self.num_buckets,
+            )
+            self._bucketed_sizes = get_bucketed_sizes(
+                self._collated_sizes, self.buckets
+            )
+            logger.info(
+                f"{len(self.buckets)} bucket(s) for the audio dataset: "
+                f"{self.buckets}"
+            )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """Return ``indices`` unchanged together with an empty list.
+
+        ``max_sizes`` is ignored, so no index is ever filtered out here.
+        """
+        return indices, []
+
+
+class FileAudioDataset(RawAudioDataset):
+    def __init__(
+        self,
+        manifest_path,
+        sample_rate,
+        max_sample_size=None,
+        min_sample_size=0,
+        shuffle=True,
+        pad=False,
+        normalize=False,
+        num_buckets=0,
+        compute_mask=False,
+        text_compression_level=TextCompressionLevel.none,
+        **mask_compute_kwargs,
+    ):
+        """Read file names and sizes from a manifest, skipping samples below min_sample_size."""
+        super().__init__(
+            sample_rate=sample_rate,
+            max_sample_size=max_sample_size,
+            min_sample_size=min_sample_size,
+            shuffle=shuffle,
+            pad=pad,
+            normalize=normalize,
+            compute_mask=compute_mask,
+            **mask_compute_kwargs,
+        )
+
+        self.text_compressor = TextCompressor(level=text_compression_level)
+
+        skipped = 0
+        self.fnames = []
+        sizes = []
+        self.skipped_indices = set()
+
+        with open(manifest_path, "r") as f:
+            self.root_dir = f.readline().strip()  # first manifest line is the root directory
+            for i, line in enumerate(f):  # i counts the lines after the root line, from 0
+                items = line.strip().split("\t")  # each entry is "<file name>\t<size>"
+                assert len(items) == 2, line
+                sz = int(items[1])
+                if min_sample_size is not None and sz < min_sample_size:
+                    skipped += 1
+                    self.skipped_indices.add(i)
+                    continue
+                self.fnames.append(self.text_compressor.compress(items[0]))
+                sizes.append(sz)
+        logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples")
+
+        self.sizes = np.array(sizes, dtype=np.int64)
+
+        try:
+            import pyarrow
+
+            self.fnames = pyarrow.array(self.fnames)
+        except Exception:  # best-effort: keep the plain list, but let KeyboardInterrupt/SystemExit propagate
+            logger.debug(
+                "Could not create a pyarrow array. Please install pyarrow for better performance"
+            )
+
+        self.set_bucket_info(num_buckets)
+
+    def __getitem__(self, index):
+        import soundfile as sf
+
+        fn = self.fnames[index]
+        fn = fn if isinstance(self.fnames, list) else fn.as_py()  # non-list storage (pyarrow array) needs as_py()
+        fn = self.text_compressor.decompress(fn)
+        path_or_fp = os.path.join(self.root_dir, fn)
+        _path, slice_ptr = parse_path(path_or_fp)
+        if len(slice_ptr) == 2:  # two slice values: read that byte range via read_from_stored_zip
+            byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
+            assert is_sf_audio_data(byte_data)
+            path_or_fp = io.BytesIO(byte_data)
+
+        retry = 3  # maximum number of read attempts
+        wav = None
+        for i in range(retry):
+            try:
+                wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32")
+                break
+            except Exception as e:
+                logger.warning(
+                    f"Failed to read {path_or_fp}: {e}. Sleeping for {1 * i}"
+                )
+                time.sleep(1 * i)  # waits 0s, 1s, 2s after successive failures
+
+        if wav is None:  # every attempt failed
+            raise Exception(f"Failed to load {path_or_fp}")
+
+        feats = torch.from_numpy(wav).float()
+        feats = self.postprocess(feats, curr_sample_rate)
+
+        v = {"id": index, "source": feats}
+
+        if self.is_compute_mask:
+            T = self._get_mask_indices_dims(feats.size(-1))
+            mask = compute_block_mask_1d(
+                shape=(self.clone_batch, T),
+                mask_prob=self.mask_prob,
+                mask_length=self.mask_length,
+                mask_prob_adjust=self.mask_prob_adjust,
+                inverse_mask=self.inverse_mask,
+                require_same_masks=True,
+                expand_adjcent=self.expand_adjacent,  # NOTE(review): keyword spelling presumably matches the helper's parameter - confirm
+                mask_dropout=self.mask_dropout,
+                non_overlapping=self.non_overlapping,
+            )
+
+            v["precomputed_mask"] = mask
+
+        return v
+
+
+class BinarizedAudioDataset(RawAudioDataset):
+    def __init__(
+        self,
+        data_dir,
+        split,
+        sample_rate,
+        max_sample_size=None,
+        min_sample_size=0,
+        shuffle=True,
+        pad=False,
+        normalize=False,
+        num_buckets=0,
+        compute_mask=False,
+        **mask_compute_kwargs,
+    ):
+        super().__init__(
+            sample_rate=sample_rate,
+            max_sample_size=max_sample_size,
+            min_sample_size=min_sample_size,
+            shuffle=shuffle,
+            pad=pad,
+            normalize=normalize,
+            compute_mask=compute_mask,
+            **mask_compute_kwargs,
+        )
+
+        from fairseq.data import data_utils, Dictionary
+
+        self.fnames_dict = Dictionary.load(os.path.join(data_dir, "dict.txt"))
+
+        root_path = os.path.join(data_dir, f"{split}.root")
+        if os.path.exists(root_path):
+            with open(root_path, "r") as f:
+                self.root_dir = next(f).strip()  # first line of <split>.root
+        else:
+            self.root_dir = None  # no .root file: __getitem__ uses file names as stored
+
+        fnames_path = os.path.join(data_dir, split)
+        self.fnames = data_utils.load_indexed_dataset(fnames_path, self.fnames_dict)
+        lengths_path = os.path.join(data_dir, f"{split}.lengths")
+
+        with open(lengths_path, "r") as f:  # one integer size per line
+            for line in f:
+                sz = int(line.rstrip())
+                assert (
+                    sz >= min_sample_size
+                ), f"Min sample size is not supported for binarized dataset, but found a sample with size {sz}"
+                self.sizes.append(sz)  # assumes the base __init__ created self.sizes as a list - TODO confirm
+
+        self.sizes = np.array(self.sizes, dtype=np.int64)
+
+        self.set_bucket_info(num_buckets)
+        logger.info(f"loaded {len(self.fnames)} samples")
+
+    def __getitem__(self, index):
+        import soundfile as sf
+
+        fname = self.fnames_dict.string(self.fnames[index], separator="")  # presumably rebuilds the file name from dictionary symbols
+        if self.root_dir:  # only set when a <split>.root file was found
+            fname = os.path.join(self.root_dir, fname)
+
+        wav, curr_sample_rate = sf.read(fname)
+        feats = torch.from_numpy(wav).float()
+        feats = self.postprocess(feats, curr_sample_rate)
+        v = {"id": index, "source": feats}
+
+        if self.is_compute_mask:
+            T = self._get_mask_indices_dims(feats.size(-1))
+            mask = compute_block_mask_1d(
+                shape=(self.clone_batch, T),
+                mask_prob=self.mask_prob,
+                mask_length=self.mask_length,
+                mask_prob_adjust=self.mask_prob_adjust,
+                inverse_mask=self.inverse_mask,
+                require_same_masks=True,
+                expand_adjcent=self.expand_adjacent,  # NOTE(review): keyword spelling presumably matches the helper's parameter - confirm
+                mask_dropout=self.mask_dropout,
+                non_overlapping=self.non_overlapping,
+            )
+
+            v["precomputed_mask"] = mask
+
+        return v
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
new file mode 100644
index 0000000000..fe4b61f831
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
@@ -0,0 +1,379 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from fairseq.data import ConcatDataset, Dictionary
+from fairseq.data import data_utils as fairseq_data_utils
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+from fairseq.data.audio.data_cfg import S2SDataConfig
+from fairseq.data.audio.speech_to_text_dataset import (
+    SpeechToTextDataset,
+    SpeechToTextDatasetCreator,
+    TextTargetMultitaskData,
+    _collate_frames,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpeechToSpeechDatasetItem(object):
+    index: int  # dataset index the item was fetched with
+    source: torch.Tensor  # source audio tensor
+    target: Optional[torch.Tensor] = None  # target frames, or dictionary-encoded ids when the target is code
+    target_speaker: Optional[torch.Tensor] = None  # target speaker tensor; empty when no speakers are loaded
+    tgt_lang_tag: Optional[int] = None  # dictionary index of the target language tag, when configured
+
+
+class SpeechToSpeechDataset(SpeechToTextDataset):
+    def __init__(
+        self,
+        split: str,
+        is_train_split: bool,
+        data_cfg: S2SDataConfig,
+        src_audio_paths: List[str],
+        src_n_frames: List[int],
+        tgt_audio_paths: List[str],
+        tgt_n_frames: List[int],
+        src_langs: Optional[List[str]] = None,
+        tgt_langs: Optional[List[str]] = None,
+        ids: Optional[List[str]] = None,
+        target_is_code: bool = False,
+        tgt_dict: Dictionary = None,
+        n_frames_per_step: int = 1,
+    ):
+        tgt_texts = tgt_audio_paths if target_is_code else None  # code targets are passed to the parent as text
+        super().__init__(
+            split=split,
+            is_train_split=is_train_split,
+            cfg=data_cfg,
+            audio_paths=src_audio_paths,
+            n_frames=src_n_frames,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            tgt_texts=tgt_texts,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            n_frames_per_step=n_frames_per_step,
+        )
+
+        self.tgt_audio_paths = tgt_audio_paths
+        self.tgt_lens = [t // self.n_frames_per_step for t in tgt_n_frames]  # lengths in packed steps
+
+        assert not target_is_code or tgt_dict is not None  # code targets need a dictionary
+        self.target_is_code = target_is_code
+
+        assert len(tgt_audio_paths) == self.n_samples
+        assert len(tgt_n_frames) == self.n_samples
+
+        self.tgt_speakers = None
+        if self.cfg.target_speaker_embed:
+            samples = SpeechToTextDatasetCreator._load_samples_from_tsv(
+                self.cfg.target_speaker_embed, split
+            )
+            spk_emb_dict = {s["id"]: s["speaker_embed"] for s in samples}
+            self.tgt_speakers = [spk_emb_dict[sample_id] for sample_id in self.ids]  # aligned with self.ids
+            assert len(self.tgt_speakers) == self.n_samples
+
+        logger.info(self.__repr__())
+
+    def pack_units(self, input: torch.Tensor) -> torch.Tensor:
+        if self.n_frames_per_step <= 1:
+            return input
+        # Fold each group of n_frames_per_step unit ids into one id (digits in base vocab_size).
+        offset = 4
+        vocab_size = (
+            len(self.tgt_dict) - offset
+        )  # remove offset from <bos>, <pad>, <eos>, <unk>, which is specific to fairseq dictionary
+
+        assert input.dim() == 1
+        stacked_input = (
+            input[:-1].view(-1, self.n_frames_per_step) - offset
+        )  # remove <eos>
+        scale = [
+            pow(vocab_size, self.n_frames_per_step - 1 - i)  # first unit in a group is the most significant digit
+            for i in range(self.n_frames_per_step)
+        ]
+        scale = torch.LongTensor(scale).squeeze(0)
+        res = input.new((len(input) - 1) // self.n_frames_per_step + 1).fill_(input[-1])  # one slot per group plus the last input element
+        res[:-1] = (stacked_input * scale).sum(dim=1) + offset
+
+        return res
+
+    def __getitem__(self, index: int) -> SpeechToSpeechDatasetItem:
+        source = self._get_source_audio(index)
+
+        tgt_lang_tag = None
+        if self.cfg.prepend_tgt_lang_tag_as_bos:
+            # prepend_tgt_lang_tag_as_bos: put tgt_lang_tag as bos of target
+            tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
+
+        if not self.target_is_code:  # target is loaded from an audio/feature path
+            target = get_features_or_waveform(self.tgt_audio_paths[index])
+            target = torch.from_numpy(target).float()
+            target = self.pack_frames(target)
+        else:  # target is a text sequence encoded with tgt_dict
+            target = self.tgt_dict.encode_line(
+                self.tgt_audio_paths[index],
+                add_if_not_exist=False,
+                append_eos=True,
+            ).long()
+            if self.n_frames_per_step > 1:
+                n_tgt_frame = target.size(0) - 1  # exclude <eos>
+                keep_n_tgt_frame = n_tgt_frame - n_tgt_frame % self.n_frames_per_step  # drop units that do not fill a whole step
+                target = torch.cat(
+                    (
+                        target[:keep_n_tgt_frame],
+                        target.new_full((1,), self.tgt_dict.eos()),  # re-append <eos> after truncation
+                    ),
+                    dim=0,
+                )
+
+        if self.tgt_speakers:
+            tgt_spk = get_features_or_waveform(self.tgt_speakers[index])
+            tgt_spk = torch.from_numpy(tgt_spk).float()
+        else:
+            tgt_spk = torch.FloatTensor([])  # empty placeholder when no speakers are loaded
+
+        return SpeechToSpeechDatasetItem(
+            index=index,
+            source=source,
+            target=target,
+            target_speaker=tgt_spk,
+            tgt_lang_tag=tgt_lang_tag,
+        )
+
+    def _collate_target(self, samples: List[SpeechToSpeechDatasetItem]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        if self.target_is_code:
+            target = fairseq_data_utils.collate_tokens(
+                [x.target for x in samples],
+                self.tgt_dict.pad(),
+                self.tgt_dict.eos(),
+                left_pad=False,
+                move_eos_to_beginning=False,
+            )
+            # convert stacked units to a single id
+            pack_targets = [self.pack_units(x.target) for x in samples]
+            prev_output_tokens = fairseq_data_utils.collate_tokens(
+                pack_targets,
+                self.tgt_dict.pad(),
+                self.tgt_dict.eos(),
+                left_pad=False,
+                move_eos_to_beginning=True,
+            )
+            target_lengths = torch.tensor(
+                [x.size(0) for x in pack_targets], dtype=torch.long  # lengths of the packed targets
+            )
+        else:
+            target = _collate_frames([x.target for x in samples], is_audio_input=False)
+            bsz, _, d = target.size()
+            prev_output_tokens = torch.cat(
+                (target.new_full((bsz, 1, d), 0.0), target[:, :-1, :]), dim=1  # prepend an all-zero frame, drop the last one
+            )
+            target_lengths = torch.tensor(
+                [x.target.size(0) for x in samples], dtype=torch.long
+            )
+
+        return target, prev_output_tokens, target_lengths  # all three are in the input sample order
+
+    def collater(
+        self, samples: List[SpeechToSpeechDatasetItem], return_order: bool = False
+    ) -> Dict:
+        if len(samples) == 0:
+            return {}
+        indices = torch.tensor([x.index for x in samples], dtype=torch.long)
+        frames = _collate_frames([x.source for x in samples], self.cfg.use_audio_input)
+        # sort samples by descending number of frames
+        n_frames = torch.tensor([x.source.size(0) for x in samples], dtype=torch.long)
+        n_frames, order = n_frames.sort(descending=True)
+        indices = indices.index_select(0, order)
+        frames = frames.index_select(0, order)
+
+        target, prev_output_tokens, target_lengths = self._collate_target(samples)
+        target = target.index_select(0, order)  # apply the same source-length ordering to the targets
+        target_lengths = target_lengths.index_select(0, order)
+        prev_output_tokens = prev_output_tokens.index_select(0, order)
+        ntokens = sum(x.target.size(0) for x in samples)
+
+        tgt_speakers = None
+        if self.cfg.target_speaker_embed:
+            tgt_speakers = _collate_frames(
+                [x.target_speaker for x in samples], is_audio_input=True
+            ).index_select(0, order)
+
+        net_input = {
+            "src_tokens": frames,
+            "src_lengths": n_frames,
+            "prev_output_tokens": prev_output_tokens,
+            "tgt_speaker": tgt_speakers,  # TODO: unify "speaker" and "tgt_speaker"
+        }
+        if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
+            for i in range(len(samples)):
+                net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag  # row i holds sample order[i] after sorting
+        out = {
+            "id": indices,
+            "net_input": net_input,
+            "speaker": tgt_speakers,  # to support Tacotron2 loss for speech-to-spectrogram model
+            "target": target,
+            "target_lengths": target_lengths,
+            "ntokens": ntokens,
+            "nsentences": len(samples),
+        }
+        if return_order:  # expose the sort permutation to the caller
+            out["order"] = order
+        return out
+
+
+class SpeechToSpeechMultitaskDataset(SpeechToSpeechDataset):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.multitask_data = {}  # task name -> task dataset, filled by add_multitask_dataset
+
+    def add_multitask_dataset(self, task_name, task_data):
+        self.multitask_data[task_name] = task_data
+
+    def __getitem__(
+        self, index: int
+    ) -> Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]:
+        s2s_data = super().__getitem__(index)
+
+        multitask_target = {}
+        sample_id = self.ids[index]
+        tgt_lang = self.tgt_langs[index]
+        for task_name, task_dataset in self.multitask_data.items():
+            multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang)  # looked up by sample id, not by index
+
+        return s2s_data, multitask_target
+
+    def collater(
+        self, samples: List[Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]]
+    ) -> Dict:
+        if len(samples) == 0:
+            return {}
+
+        out = super().collater([s for s, _ in samples], return_order=True)
+        order = out["order"]  # sort permutation used by the parent collater
+        del out["order"]  # used locally only; not part of the returned batch
+
+        for task_name, task_dataset in self.multitask_data.items():
+            if "multitask" not in out:
+                out["multitask"] = {}
+            d = [s[task_name] for _, s in samples]
+            task_target = task_dataset.collater(d)
+            out["multitask"][task_name] = {
+                "target": task_target["target"].index_select(0, order),  # reorder to match the main batch
+                "target_lengths": task_target["target_lengths"].index_select(0, order),
+                "ntokens": task_target["ntokens"],
+            }
+            out["multitask"][task_name]["net_input"] = {
+                "prev_output_tokens": task_target["prev_output_tokens"].index_select(
+                    0, order
+                ),
+            }
+
+        return out
+
+
+class SpeechToSpeechDatasetCreator(object):
+    # mandatory columns
+    KEY_ID, KEY_SRC_AUDIO, KEY_SRC_N_FRAMES = "id", "src_audio", "src_n_frames"
+    KEY_TGT_AUDIO, KEY_TGT_N_FRAMES = "tgt_audio", "tgt_n_frames"
+    # optional columns
+    KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang"
+    # default values
+    DEFAULT_LANG = ""
+
+    @classmethod
+    def _from_list(
+        cls,
+        split_name: str,
+        is_train_split,
+        samples: List[Dict],
+        data_cfg: S2SDataConfig,
+        target_is_code: bool = False,
+        tgt_dict: Dictionary = None,
+        n_frames_per_step: int = 1,
+        multitask: Optional[Dict] = None,
+    ) -> SpeechToSpeechDataset:
+        audio_root = Path(data_cfg.audio_root)
+        ids = [s[cls.KEY_ID] for s in samples]
+        src_audio_paths = [
+            (audio_root / s[cls.KEY_SRC_AUDIO]).as_posix() for s in samples
+        ]
+        tgt_audio_paths = [
+            s[cls.KEY_TGT_AUDIO]  # code targets: the column value is used verbatim
+            if target_is_code
+            else (audio_root / s[cls.KEY_TGT_AUDIO]).as_posix()  # otherwise resolved against audio_root
+            for s in samples
+        ]
+        src_n_frames = [int(s[cls.KEY_SRC_N_FRAMES]) for s in samples]
+        tgt_n_frames = [int(s[cls.KEY_TGT_N_FRAMES]) for s in samples]
+        src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]  # optional column
+        tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]  # optional column
+
+        has_multitask = multitask is not None and len(multitask.keys()) > 0
+        dataset_cls = (
+            SpeechToSpeechMultitaskDataset if has_multitask else SpeechToSpeechDataset
+        )
+
+        ds = dataset_cls(
+            split=split_name,
+            is_train_split=is_train_split,
+            data_cfg=data_cfg,
+            src_audio_paths=src_audio_paths,
+            src_n_frames=src_n_frames,
+            tgt_audio_paths=tgt_audio_paths,
+            tgt_n_frames=tgt_n_frames,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            target_is_code=target_is_code,
+            tgt_dict=tgt_dict,
+            n_frames_per_step=n_frames_per_step,
+        )
+
+        if has_multitask:  # attach one text-target dataset per auxiliary task
+            for task_name, task_obj in multitask.items():
+                task_data = TextTargetMultitaskData(
+                    task_obj.args, split_name, task_obj.target_dictionary
+                )
+                ds.add_multitask_dataset(task_name, task_data)
+        return ds
+
+    @classmethod
+    def from_tsv(
+        cls,
+        root: str,
+        data_cfg: S2SDataConfig,
+        splits: str,  # comma-separated split names
+        is_train_split: bool,
+        epoch: int,  # not used in this method
+        seed: int,  # not used in this method
+        target_is_code: bool = False,
+        tgt_dict: Dictionary = None,
+        n_frames_per_step: int = 1,
+        multitask: Optional[Dict] = None,
+    ) -> SpeechToSpeechDataset:
+        datasets = []
+        for split in splits.split(","):
+            samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split)
+            ds = cls._from_list(
+                split_name=split,
+                is_train_split=is_train_split,
+                samples=samples,
+                data_cfg=data_cfg,
+                target_is_code=target_is_code,
+                tgt_dict=tgt_dict,
+                n_frames_per_step=n_frames_per_step,
+                multitask=multitask,
+            )
+            datasets.append(ds)
+        return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]  # a single split is returned unwrapped
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
new file mode 100644
index 0000000000..cdf71558fd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
@@ -0,0 +1,733 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import csv
+import logging
+import re
+from argparse import Namespace
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from fairseq.data import ConcatDataset, Dictionary, FairseqDataset, ResamplingDataset
+from fairseq.data import data_utils as fairseq_data_utils
+from fairseq.data import encoders
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+from fairseq.data.audio.data_cfg import S2TDataConfig
+from fairseq.data.audio.dataset_transforms import CompositeAudioDatasetTransform
+from fairseq.data.audio.dataset_transforms.concataugment import ConcatAugment
+from fairseq.data.audio.dataset_transforms.noisyoverlapaugment import (
+    NoisyOverlapAugment,
+)
+from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform
+from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform
+
+logger = logging.getLogger(__name__)
+
+
+def _collate_frames(
+    frames: List[torch.Tensor], is_audio_input: bool = False
+) -> torch.Tensor:
+    """
+    Zero-pad a list of frame tensors to a common length and stack them
+    Args:
+        frames (list): 2D tensors of size L[i]*f_dim (1D of length L[i] if
+            is_audio_input), where L[i] is the length of the i-th item
+        is_audio_input (bool): frames are 1D and the output has no f_dim axis
+    Returns:
+        tensor of size len(frames)*len_max(*f_dim) where len_max is max of L[i]
+    """
+    batch_shape = [len(frames), max(f.size(0) for f in frames)]
+    if not is_audio_input:
+        batch_shape.append(frames[0].size(1))
+    padded = frames[0].new_zeros(batch_shape)
+    for row, frame in zip(padded, frames):  # each row is a view into padded
+        row[: frame.size(0)] = frame
+    return padded
+
+
+def _is_int_or_np_int(n):
+    if isinstance(n, int):  # plain Python int (bool included, as an int subclass)
+        return True
+    return isinstance(n, np.generic) and isinstance(n.item(), int)  # NumPy scalar whose Python value is an int
+
+
+@dataclass
+class SpeechToTextDatasetItem(object):
+    index: int  # dataset index the item was fetched with
+    source: torch.Tensor  # source audio tensor
+    target: Optional[torch.Tensor] = None  # encoded target ids; None when the dataset has no target texts
+    speaker_id: Optional[int] = None  # value looked up in speaker_to_id, when that mapping is given
+
+class SpeechToTextDataset(FairseqDataset):
+    LANG_TAG_TEMPLATE = "<lang:{}>"
+
+    def __init__(
+        self,
+        split: str,
+        is_train_split: bool,
+        cfg: S2TDataConfig,
+        audio_paths: List[str],
+        n_frames: List[int],
+        src_texts: Optional[List[str]] = None,
+        tgt_texts: Optional[List[str]] = None,
+        speakers: Optional[List[str]] = None,
+        src_langs: Optional[List[str]] = None,
+        tgt_langs: Optional[List[str]] = None,
+        ids: Optional[List[str]] = None,
+        tgt_dict: Optional[Dictionary] = None,
+        pre_tokenizer=None,
+        bpe_tokenizer=None,
+        n_frames_per_step=1,
+        speaker_to_id=None,
+        append_eos=True,
+    ):
+        self.split, self.is_train_split = split, is_train_split
+        self.cfg = cfg
+        self.audio_paths, self.n_frames = audio_paths, n_frames
+        self.n_samples = len(audio_paths)
+        assert len(n_frames) == self.n_samples > 0  # lengths must match and the dataset must be non-empty
+        assert src_texts is None or len(src_texts) == self.n_samples
+        assert tgt_texts is None or len(tgt_texts) == self.n_samples
+        assert speakers is None or len(speakers) == self.n_samples
+        assert src_langs is None or len(src_langs) == self.n_samples
+        assert tgt_langs is None or len(tgt_langs) == self.n_samples
+        assert ids is None or len(ids) == self.n_samples
+        assert (tgt_dict is None and tgt_texts is None) or (
+            tgt_dict is not None and tgt_texts is not None
+        )  # tgt_dict and tgt_texts must be given together or not at all
+        self.src_texts, self.tgt_texts = src_texts, tgt_texts
+        self.src_langs, self.tgt_langs = src_langs, tgt_langs
+        self.speakers = speakers
+        self.tgt_dict = tgt_dict
+        self.check_tgt_lang_tag()
+        self.ids = ids
+        self.shuffle = cfg.shuffle if is_train_split else False  # non-training splits are never shuffled
+
+        self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict(
+            self.cfg.get_feature_transforms(split, is_train_split)
+        )
+        self.waveform_transforms = CompositeAudioWaveformTransform.from_config_dict(
+            self.cfg.get_waveform_transforms(split, is_train_split)
+        )
+        # TODO: add these to data_cfg.py
+        self.dataset_transforms = CompositeAudioDatasetTransform.from_config_dict(
+            self.cfg.get_dataset_transforms(split, is_train_split)
+        )
+
+        # check proper usage of transforms
+        if self.feature_transforms and self.cfg.use_audio_input:
+            logger.warning(
+                "Feature transforms will not be applied. To use feature transforms, "
+                "set use_audio_input as False in config."
+            )
+
+        self.pre_tokenizer = pre_tokenizer
+        self.bpe_tokenizer = bpe_tokenizer
+        self.n_frames_per_step = n_frames_per_step
+        self.speaker_to_id = speaker_to_id
+
+        self.tgt_lens = self.get_tgt_lens_and_check_oov()  # needs the tokenizers assigned above
+        self.append_eos = append_eos
+
+        logger.info(self.__repr__())
+
+    def get_tgt_lens_and_check_oov(self):
+        """Return the token count of each tokenized target and log the OOV rate."""
+        if self.tgt_texts is None:
+            return [0] * self.n_samples
+        tgt_lens = []
+        n_tokens = 0
+        n_oov_tokens = 0
+        for sample_idx in range(self.n_samples):
+            pieces = self.get_tokenized_tgt_text(sample_idx).split(" ")
+            # a piece counts as OOV when the dictionary maps it to unk_index
+            n_oov_tokens += sum(
+                1 for p in pieces if self.tgt_dict.index(p) == self.tgt_dict.unk_index
+            )
+            n_tokens += len(pieces)
+            tgt_lens.append(len(pieces))
+        logger.info(f"'{self.split}' has {n_oov_tokens / n_tokens * 100:.2f}% OOV")
+        return tgt_lens
+
+    def __repr__(self):
+        return (
+            self.__class__.__name__  # subclasses report their own class name
+            + f'(split="{self.split}", n_samples={self.n_samples:_}, '  # ":_" groups digits with underscores
+            f"prepend_tgt_lang_tag={self.cfg.prepend_tgt_lang_tag}, "
+            f"n_frames_per_step={self.n_frames_per_step}, "
+            f"shuffle={self.shuffle}, "
+            f"feature_transforms={self.feature_transforms}, "
+            f"waveform_transforms={self.waveform_transforms}, "
+            f"dataset_transforms={self.dataset_transforms})"
+        )
+
+    @classmethod
+    def is_lang_tag(cls, token):
+        pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")  # "<lang:{}>" becomes "<lang:(.*)>"
+        return re.match(pattern, token)  # match object or None; re.match anchors at the start only
+
+    def check_tgt_lang_tag(self):
+        if self.cfg.prepend_tgt_lang_tag:  # nothing is checked when the option is off
+            assert self.tgt_langs is not None and self.tgt_dict is not None
+            tgt_lang_tags = [
+                self.LANG_TAG_TEMPLATE.format(t) for t in set(self.tgt_langs)  # one tag per distinct language
+            ]
+            assert all(t in self.tgt_dict for t in tgt_lang_tags)  # every tag must be in the target dictionary
+
+    @classmethod
+    def tokenize(cls, tokenizer, text: str):
+        return text if tokenizer is None else tokenizer.encode(text)  # a None tokenizer leaves the text unchanged
+
+    def get_tokenized_tgt_text(self, index: Union[int, List[int]]):
+        if _is_int_or_np_int(index):
+            text = self.tgt_texts[index]
+        else:
+            text = " ".join([self.tgt_texts[i] for i in index])  # several indices: texts joined with spaces, in order
+        # pre-tokenizer first, then BPE; a tokenizer that is None is a no-op
+        text = self.tokenize(self.pre_tokenizer, text)
+        text = self.tokenize(self.bpe_tokenizer, text)
+        return text
+
+    def pack_frames(self, feature: torch.Tensor):
+        step = self.n_frames_per_step  # number of frames merged into one packed frame
+        if step == 1:
+            return feature
+        n_packed = feature.shape[0] // step  # trailing frames that do not fill a step are dropped
+        return feature[: step * n_packed].reshape(n_packed, -1)
+
+    @classmethod
+    def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
+        lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))  # looks up "<lang:LANG>"
+        assert lang_tag_idx != dictionary.unk()  # presumably unk means the tag is missing from the dictionary - confirm
+        return lang_tag_idx
+
+    def _get_source_audio(self, index: Union[int, List[int]]) -> torch.Tensor:
+        """
+        Gives source audio for given index with any relevant transforms
+        applied. For ConcatAug, source audios for given indices are
+        concatenated in given order.
+        Args:
+            index (int or List[int]): index—or in the case of ConcatAug,
+            indices—to pull the source audio for
+        Returns:
+            source audios concatenated for given indices with
+            relevant transforms applied
+        """
+        if _is_int_or_np_int(index):
+            source = get_features_or_waveform(
+                self.audio_paths[index],
+                need_waveform=self.cfg.use_audio_input,
+                use_sample_rate=self.cfg.use_sample_rate,
+                waveform_transforms=self.waveform_transforms,
+            )
+        else:
+            source = np.concatenate(
+                [
+                    get_features_or_waveform(
+                        self.audio_paths[i],
+                        need_waveform=self.cfg.use_audio_input,
+                        use_sample_rate=self.cfg.use_sample_rate,
+                        waveform_transforms=self.waveform_transforms,
+                    )
+                    for i in index
+                ]
+            )
+        if self.cfg.use_audio_input:
+            source = torch.from_numpy(source).float()
+            if self.cfg.standardize_audio:
+                with torch.no_grad():
+                    source = F.layer_norm(source, source.shape)  # normalizes over the whole tensor
+        else:
+            if self.feature_transforms is not None:  # feature transforms run on the NumPy array, before tensor conversion
+                source = self.feature_transforms(source)
+            source = torch.from_numpy(source).float()
+        return source
+
+    def __getitem__(self, index: int) -> SpeechToTextDatasetItem:
+        has_concat = self.dataset_transforms.has_transform(ConcatAugment)
+        if has_concat:
+            concat = self.dataset_transforms.get_transform(ConcatAugment)
+            indices = concat.find_indices(index, self.n_frames, self.n_samples)  # only bound when has_concat is true
+
+        source = self._get_source_audio(indices if has_concat else index)
+        source = self.pack_frames(source)
+
+        target = None
+        if self.tgt_texts is not None:
+            tokenized = self.get_tokenized_tgt_text(indices if has_concat else index)
+            target = self.tgt_dict.encode_line(
+                tokenized, add_if_not_exist=False, append_eos=self.append_eos
+            ).long()
+            if self.cfg.prepend_tgt_lang_tag:
+                lang_tag_idx = self.get_lang_tag_idx(
+                    self.tgt_langs[index], self.tgt_dict
+                )
+                target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)  # language tag goes first
+
+        if self.cfg.prepend_bos_and_append_tgt_lang_tag:
+            bos = torch.LongTensor([self.tgt_dict.bos()])
+            lang_tag_idx = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
+            assert lang_tag_idx != self.tgt_dict.unk()
+            lang_tag_idx = torch.LongTensor([lang_tag_idx])
+            target = torch.cat((bos, target, lang_tag_idx), 0)  # NOTE(review): target is still None here if tgt_texts is None - confirm this option implies targets
+
+        speaker_id = None
+        if self.speaker_to_id is not None:
+            speaker_id = self.speaker_to_id[self.speakers[index]]
+        return SpeechToTextDatasetItem(
+            index=index, source=source, target=target, speaker_id=speaker_id
+        )
+
+    def __len__(self):  # dataset length is the stored sample count
+        return self.n_samples
+
+    def collater(  # merge a list of items into one batch dict, rows sorted by descending frame count
+        self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
+    ) -> Dict:
+        if len(samples) == 0:
+            return {}  # an empty batch is represented by an empty dict
+        indices = torch.tensor([x.index for x in samples], dtype=torch.long)
+
+        sources = [x.source for x in samples]
+        has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment)
+        if has_NOAug and self.cfg.use_audio_input:  # the batch-level augment is only applied for raw-audio input
+            NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment)
+            sources = NOAug(sources)
+
+        frames = _collate_frames(sources, self.cfg.use_audio_input)
+        # sort samples by descending number of frames
+        n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long)
+        n_frames, order = n_frames.sort(descending=True)
+        indices = indices.index_select(0, order)  # every per-sample tensor below is reordered with the same `order`
+        frames = frames.index_select(0, order)
+
+        target, target_lengths = None, None
+        prev_output_tokens = None
+        ntokens = None
+        if self.tgt_texts is not None:
+            target = fairseq_data_utils.collate_tokens(
+                [x.target for x in samples],
+                self.tgt_dict.pad(),
+                self.tgt_dict.eos(),
+                left_pad=False,
+                move_eos_to_beginning=False,
+            )
+            target = target.index_select(0, order)
+            target_lengths = torch.tensor(
+                [x.target.size(0) for x in samples], dtype=torch.long
+            ).index_select(0, order)
+            prev_output_tokens = fairseq_data_utils.collate_tokens(  # same targets, collated with move_eos_to_beginning=True
+                [x.target for x in samples],
+                self.tgt_dict.pad(),
+                eos_idx=None,
+                left_pad=False,
+                move_eos_to_beginning=True,
+            )
+            prev_output_tokens = prev_output_tokens.index_select(0, order)
+            ntokens = sum(x.target.size(0) for x in samples)  # total target length over the batch
+
+        speaker = None
+        if self.speaker_to_id is not None:
+            speaker = (
+                torch.tensor([s.speaker_id for s in samples], dtype=torch.long)
+                .index_select(0, order)
+                .view(-1, 1)  # one speaker id per row, as a column
+            )
+
+        net_input = {
+            "src_tokens": frames,
+            "src_lengths": n_frames,
+            "prev_output_tokens": prev_output_tokens,
+        }
+        out = {
+            "id": indices,
+            "net_input": net_input,
+            "speaker": speaker,
+            "target": target,
+            "target_lengths": target_lengths,
+            "ntokens": ntokens,
+            "nsentences": len(samples),
+        }
+        if return_order:
+            out["order"] = order  # sort permutation, so callers can reorder additional per-sample fields
+        return out
+
+    def num_tokens(self, index):  # a sample's token count is its recorded frame count
+        return self.n_frames[index]
+
+    def size(self, index):  # pair of the n_frames and tgt_lens entries for this index
+        return self.n_frames[index], self.tgt_lens[index]
+
+    @property
+    def sizes(self):  # frame counts of all samples as a NumPy array
+        return np.array(self.n_frames)  # np.array builds a new array on every access
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):  # this dataset always reports True
+        return True
+
+    def ordered_indices(self):
+        """Return sample indices sorted by descending frame count."""
+        n_items = len(self)
+        # ties are broken by a random permutation when shuffling, else by position
+        tie_break = np.random.permutation(n_items) if self.shuffle else np.arange(n_items)
+        # np.lexsort uses the LAST key as the primary one; negating gives descending order
+        neg_frames = [-count for count in self.n_frames]
+        return np.lexsort([tie_break, neg_frames])
+
+    def prefetch(self, indices):  # prefetching is not supported by this dataset; `indices` is ignored
+        raise NotImplementedError("prefetch is not supported by this dataset")  # was `raise False`, which itself fails with TypeError
+
+
+class TextTargetMultitaskData(object):  # text targets for one auxiliary task, looked up by sample id
+    # mandatory columns
+    KEY_ID, KEY_TEXT = "id", "tgt_text"
+    LANG_TAG_TEMPLATE = "<lang:{}>"
+
+    def __init__(self, args, split, tgt_dict):
+        samples = SpeechToTextDatasetCreator._load_samples_from_tsv(args.data, split)
+        self.data = {s[self.KEY_ID]: s[self.KEY_TEXT] for s in samples}  # sample id -> raw target text
+        self.dict = tgt_dict
+        self.append_eos = args.decoder_type != "ctc"  # no EOS is appended when the decoder type is "ctc"
+        self.pre_tokenizer = self.build_tokenizer(args)
+        self.bpe_tokenizer = self.build_bpe(args)
+        self.prepend_bos_and_append_tgt_lang_tag = (
+            args.prepend_bos_and_append_tgt_lang_tag
+        )
+        self.eos_token = args.eos_token
+        self.lang_tag_mapping = args.get_lang_tag_mapping  # used below via .get(), i.e. as a mapping
+
+    @classmethod
+    def is_lang_tag(cls, token):  # returns a match object (truthy) or None
+        pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)")
+        return re.match(pattern, token)
+
+    @classmethod
+    def tokenize(cls, tokenizer, text: str):  # a None tokenizer leaves the text unchanged
+        return text if tokenizer is None else tokenizer.encode(text)
+
+    def get_tokenized_tgt_text(self, index: int):  # NOTE(review): `index` is used as a key of self.data; get() passes a sample id here
+        text = self.tokenize(self.pre_tokenizer, self.data[index])
+        text = self.tokenize(self.bpe_tokenizer, text)
+        return text
+
+    def get_lang_tag_idx(self, lang: str, dictionary: Dictionary):
+        lang_tag = self.LANG_TAG_TEMPLATE.format(lang)
+        lang_tag = self.lang_tag_mapping.get(lang_tag, lang_tag)  # optional remapping; falls back to the templated tag
+        lang_tag_idx = dictionary.index(lang_tag)
+        assert lang_tag_idx != dictionary.unk(), (lang, lang_tag)  # the tag must exist in the dictionary
+        return lang_tag_idx
+
+    def build_tokenizer(self, args):  # returns None when the config has no "pre_tokenizer" entry
+        pre_tokenizer = args.config.get("pre_tokenizer")
+        if pre_tokenizer is not None:
+            logger.info(f"pre-tokenizer: {pre_tokenizer}")
+            return encoders.build_tokenizer(Namespace(**pre_tokenizer))
+        else:
+            return None
+
+    def build_bpe(self, args):  # returns None when the config has no "bpe_tokenizer" entry
+        bpe_tokenizer = args.config.get("bpe_tokenizer")
+        if bpe_tokenizer is not None:
+            logger.info(f"tokenizer: {bpe_tokenizer}")
+            return encoders.build_bpe(Namespace(**bpe_tokenizer))
+        else:
+            return None
+
+    def get(self, sample_id, tgt_lang=None):  # encoded target for sample_id, or an empty tensor when the id is unknown
+        if sample_id in self.data:
+            tokenized = self.get_tokenized_tgt_text(sample_id)
+            target = self.dict.encode_line(
+                tokenized,
+                add_if_not_exist=False,
+                append_eos=self.append_eos,
+            )
+            if self.prepend_bos_and_append_tgt_lang_tag:
+                bos = torch.LongTensor([self.dict.bos()])
+                lang_tag_idx = self.get_lang_tag_idx(tgt_lang, self.dict)
+                assert lang_tag_idx != self.dict.unk()
+                lang_tag_idx = torch.LongTensor([lang_tag_idx])
+                target = torch.cat((bos, target, lang_tag_idx), 0)  # layout: [bos, <target tokens>, lang tag]
+            return target
+        else:
+            logger.warning(f"no target for {sample_id}")  # missing ids are logged, not raised
+            return torch.IntTensor([])
+
+    def collater(self, samples: List[torch.Tensor]) -> torch.Tensor:  # NOTE(review): the value returned is a dict, not a Tensor
+        out = fairseq_data_utils.collate_tokens(
+            samples,
+            self.dict.pad(),
+            eos_idx=None,
+            left_pad=False,
+            move_eos_to_beginning=False,
+        ).long()
+
+        prev_out = fairseq_data_utils.collate_tokens(  # same samples, collated with move_eos_to_beginning=True
+            samples,
+            self.dict.pad(),
+            eos_idx=None,
+            left_pad=False,
+            move_eos_to_beginning=True,
+        ).long()
+
+        target_lengths = torch.tensor([t.size(0) for t in samples], dtype=torch.long)
+        ntokens = sum(t.size(0) for t in samples)  # total length over all samples
+
+        output = {
+            "prev_output_tokens": prev_out,
+            "target": out,
+            "target_lengths": target_lengths,
+            "ntokens": ntokens,
+        }
+
+        return output
+
+
+class SpeechToTextMultitaskDataset(SpeechToTextDataset):  # SpeechToTextDataset plus per-task auxiliary targets
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.multitask_data = {}  # task name -> task data object, filled by add_multitask_dataset
+
+    def add_multitask_dataset(self, task_name, task_data):  # registers (or replaces) the data object for task_name
+        self.multitask_data[task_name] = task_data
+
+    def __getitem__(
+        self, index: int
+    ) -> Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]:
+        s2t_data = super().__getitem__(index)
+
+        multitask_target = {}
+        sample_id = self.ids[index]
+        tgt_lang = self.tgt_langs[index]
+        for task_name, task_dataset in self.multitask_data.items():
+            multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang)  # looked up by sample id, not by position
+
+        return s2t_data, multitask_target
+
+    def collater(
+        self, samples: List[Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]]
+    ) -> Dict:
+        if len(samples) == 0:
+            return {}
+
+        out = super().collater([s for s, _ in samples], return_order=True)
+        order = out["order"]  # permutation the parent applied to the batch rows
+        del out["order"]  # not part of the returned batch
+
+        for task_name, task_dataset in self.multitask_data.items():
+            if "multitask" not in out:
+                out["multitask"] = {}  # the key is only created when at least one task is registered
+            d = [s[task_name] for _, s in samples]
+            task_target = task_dataset.collater(d)
+            out["multitask"][task_name] = {
+                "target": task_target["target"].index_select(0, order),  # reorder task tensors to match the parent's rows
+                "target_lengths": task_target["target_lengths"].index_select(0, order),
+                "ntokens": task_target["ntokens"],
+            }
+            out["multitask"][task_name]["net_input"] = {
+                "prev_output_tokens": task_target["prev_output_tokens"].index_select(
+                    0, order
+                ),
+            }
+
+        return out
+
+
+class SpeechToTextDatasetCreator(object):  # builds SpeechToTextDataset objects from TSV manifests
+    # mandatory columns
+    KEY_ID, KEY_AUDIO, KEY_N_FRAMES = "id", "audio", "n_frames"
+    KEY_TGT_TEXT = "tgt_text"
+    # optional columns
+    KEY_SPEAKER, KEY_SRC_TEXT = "speaker", "src_text"
+    KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang"
+    # default values
+    DEFAULT_SPEAKER = DEFAULT_SRC_TEXT = DEFAULT_LANG = ""
+
+    @classmethod
+    def _from_list(  # build one dataset from already-loaded manifest rows
+        cls,
+        split_name: str,
+        is_train_split,
+        samples: List[Dict],
+        cfg: S2TDataConfig,
+        tgt_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        n_frames_per_step,
+        speaker_to_id,
+        multitask: Optional[Dict] = None,
+    ) -> SpeechToTextDataset:
+        audio_root = Path(cfg.audio_root)
+        ids = [s[cls.KEY_ID] for s in samples]
+        audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]  # audio column is joined onto cfg.audio_root
+        n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
+        tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
+        src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]  # optional columns default to ""
+        speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
+        src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
+        tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
+
+        has_multitask = multitask is not None and len(multitask.keys()) > 0
+        dataset_cls = (
+            SpeechToTextMultitaskDataset if has_multitask else SpeechToTextDataset
+        )
+
+        ds = dataset_cls(
+            split=split_name,
+            is_train_split=is_train_split,
+            cfg=cfg,
+            audio_paths=audio_paths,
+            n_frames=n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            n_frames_per_step=n_frames_per_step,
+            speaker_to_id=speaker_to_id,
+        )
+
+        if has_multitask:
+            for task_name, task_obj in multitask.items():
+                task_data = TextTargetMultitaskData(  # each task loads its own targets for the same split name
+                    task_obj.args, split_name, task_obj.target_dictionary
+                )
+                ds.add_multitask_dataset(task_name, task_data)
+        return ds
+
+    @classmethod
+    def get_size_ratios(
+        cls, datasets: List[SpeechToTextDataset], alpha: float = 1.0
+    ) -> List[float]:
+        """Size ratios for temperature-based sampling
+        (https://arxiv.org/abs/1907.05019)"""
+
+        id_to_lp, lp_to_sz = {}, defaultdict(int)
+        for ds in datasets:
+            lang_pairs = {f"{s}->{t}" for s, t in zip(ds.src_langs, ds.tgt_langs)}
+            assert len(lang_pairs) == 1  # each dataset must contain exactly one language pair
+            lang_pair = list(lang_pairs)[0]
+            id_to_lp[ds.split] = lang_pair
+            lp_to_sz[lang_pair] += sum(ds.n_frames)  # size of a language pair is its total frame count
+
+        sz_sum = sum(v for v in lp_to_sz.values())
+        lp_to_prob = {k: v / sz_sum for k, v in lp_to_sz.items()}
+        lp_to_tgt_prob = {k: v**alpha for k, v in lp_to_prob.items()}  # reweight by raising to the power alpha
+        prob_sum = sum(v for v in lp_to_tgt_prob.values())
+        lp_to_tgt_prob = {k: v / prob_sum for k, v in lp_to_tgt_prob.items()}  # renormalize so the values sum to 1
+        lp_to_sz_ratio = {
+            k: (lp_to_tgt_prob[k] * sz_sum) / v for k, v in lp_to_sz.items()  # target size divided by actual size
+        }
+        size_ratio = [lp_to_sz_ratio[id_to_lp[ds.split]] for ds in datasets]  # one ratio per dataset, in input order
+
+        p_formatted = {
+            k: f"{lp_to_prob[k]:.3f}->{lp_to_tgt_prob[k]:.3f}" for k in lp_to_sz
+        }
+        logger.info(f"sampling probability balancing: {p_formatted}")
+        sr_formatted = {ds.split: f"{r:.3f}" for ds, r in zip(datasets, size_ratio)}
+        logger.info(f"balanced sampling size ratio: {sr_formatted}")
+        return size_ratio
+
+    @classmethod
+    def _load_samples_from_tsv(cls, root: str, split: str):  # read <root>/<split>.tsv into a list of row dicts
+        tsv_path = Path(root) / f"{split}.tsv"
+        if not tsv_path.is_file():
+            raise FileNotFoundError(f"Dataset not found: {tsv_path}")
+        with open(tsv_path) as f:  # NOTE(review): no explicit encoding, so the platform default is used — confirm manifests match it
+            reader = csv.DictReader(
+                f,
+                delimiter="\t",
+                quotechar=None,
+                doublequote=False,
+                lineterminator="\n",
+                quoting=csv.QUOTE_NONE,  # quote characters in fields are kept as literal text
+            )
+            samples = [dict(e) for e in reader]
+        if len(samples) == 0:
+            raise ValueError(f"Empty manifest: {tsv_path}")  # a manifest without data rows is an error
+        return samples
+
+    @classmethod
+    def _from_tsv(  # load one split's manifest and build its dataset
+        cls,
+        root: str,
+        cfg: S2TDataConfig,
+        split: str,
+        tgt_dict,
+        is_train_split: bool,
+        pre_tokenizer,
+        bpe_tokenizer,
+        n_frames_per_step,
+        speaker_to_id,
+        multitask: Optional[Dict] = None,
+    ) -> SpeechToTextDataset:
+        samples = cls._load_samples_from_tsv(root, split)
+        return cls._from_list(
+            split,
+            is_train_split,
+            samples,
+            cfg,
+            tgt_dict,
+            pre_tokenizer,
+            bpe_tokenizer,
+            n_frames_per_step,
+            speaker_to_id,
+            multitask,
+        )
+
+    @classmethod
+    def from_tsv(  # public entry point: `splits` is a comma-separated list of split names
+        cls,
+        root: str,
+        cfg: S2TDataConfig,
+        splits: str,
+        tgt_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        is_train_split: bool,
+        epoch: int,
+        seed: int,
+        n_frames_per_step: int = 1,
+        speaker_to_id=None,
+        multitask: Optional[Dict] = None,
+    ) -> SpeechToTextDataset:
+        datasets = [
+            cls._from_tsv(
+                root=root,
+                cfg=cfg,
+                split=split,
+                tgt_dict=tgt_dict,
+                is_train_split=is_train_split,
+                pre_tokenizer=pre_tokenizer,
+                bpe_tokenizer=bpe_tokenizer,
+                n_frames_per_step=n_frames_per_step,
+                speaker_to_id=speaker_to_id,
+                multitask=multitask,
+            )
+            for split in splits.split(",")
+        ]
+
+        if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:  # resample only for multi-split training with alpha != 1
+            # temperature-based sampling
+            size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
+            datasets = [
+                ResamplingDataset(
+                    d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)  # sample with replacement when the ratio is >= 1
+                )
+                for r, d in zip(size_ratios, datasets)
+            ]
+
+        return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]  # a single split is returned unwrapped
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
new file mode 100644
index 0000000000..06922ea083
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
@@ -0,0 +1,359 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from pathlib import Path
+from typing import Dict, List, NamedTuple, Optional
+
+import torch
+
+from fairseq.data import ConcatDataset, Dictionary, ResamplingDataset
+from fairseq.data import data_utils as fairseq_data_utils
+from fairseq.data.audio.speech_to_text_dataset import (
+    S2TDataConfig,
+    SpeechToTextDataset,
+    SpeechToTextDatasetCreator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class S2TJointDataConfig(S2TDataConfig):
+    """Data config YAML wrapper adding the source-text options used for joint training"""
+
+    @property
+    def src_vocab_filename(self):
+        """Source-text fairseq vocabulary file under data root (default: src_dict.txt)"""
+        return self.config.get("src_vocab_filename", "src_dict.txt")
+
+    @property
+    def src_pre_tokenizer(self) -> Dict:
+        """Pre-tokenizer to apply on source text before subword tokenization.
+        Returning a dictionary with `tokenizer` providing the tokenizer name and
+        the other items providing the tokenizer-specific arguments.
+        Tokenizers are defined in `fairseq.data.encoders.*`"""
+        return self.config.get("src_pre_tokenizer", {"tokenizer": None})
+
+    @property
+    def src_bpe_tokenizer(self) -> Dict:
+        """Subword tokenizer to apply on source text after pre-tokenization.
+        Returning a dictionary with `bpe` providing the tokenizer name and
+        the other items providing the tokenizer-specific arguments.
+        Tokenizers are defined in `fairseq.data.encoders.*`"""
+        return self.config.get("src_bpe_tokenizer", {"bpe": None})
+
+    @property
+    def prepend_tgt_lang_tag_no_change(self) -> bool:
+        """Prepend target lang ID token as the prev_output_tokens BOS (e.g. for
+        to-many multilingual setting). No change needed during inference.
+        This option is deprecated and replaced by prepend_tgt_lang_tag_as_bos.
+        """
+        value = self.config.get("prepend_tgt_lang_tag_no_change", None)
+        if value is None:  # deprecated key absent: fall back to the replacement key (default False)
+            return self.config.get("prepend_tgt_lang_tag_as_bos", False)
+        return value
+
+    @property
+    def sampling_text_alpha(self):
+        """Hyper-parameter alpha = 1/T for temperature-based resampling. (text
+        input only) (alpha = 1 for no resampling)"""
+        return self.config.get("sampling_text_alpha", 1.0)
+
+
+class SpeechToTextJointDatasetItem(NamedTuple):  # one sample as produced by SpeechToTextJointDataset.__getitem__
+    index: int  # position of the sample in the dataset
+    source: torch.Tensor  # source taken from the parent dataset's item
+    target: Optional[torch.Tensor] = None  # target taken from the parent dataset's item
+    src_txt_tokens: Optional[torch.Tensor] = None  # encoded source text; None without src_texts or src_dict
+    tgt_lang_tag: Optional[int] = None  # set only when cfg.prepend_tgt_lang_tag_no_change is enabled
+    src_lang_tag: Optional[int] = None  # set only when use_src_lang_id > 0 and source text is encoded
+    tgt_alignment: Optional[torch.Tensor] = None  # float tensor built from the sample's alignment values, if any
+
+
+# use_src_lang_id:
+#   0: don't use src_lang_id
+#   1: attach src_lang_id to the src_txt_tokens as eos
+class SpeechToTextJointDataset(SpeechToTextDataset):  # adds source-text tokens, language tags and alignments to each item
+    def __init__(
+        self,
+        split: str,
+        is_train_split: bool,
+        cfg: S2TJointDataConfig,
+        audio_paths: List[str],
+        n_frames: List[int],
+        src_texts: Optional[List[str]] = None,
+        tgt_texts: Optional[List[str]] = None,
+        speakers: Optional[List[str]] = None,
+        src_langs: Optional[List[str]] = None,
+        tgt_langs: Optional[List[str]] = None,
+        ids: Optional[List[str]] = None,
+        tgt_dict: Optional[Dictionary] = None,
+        src_dict: Optional[Dictionary] = None,
+        pre_tokenizer=None,
+        bpe_tokenizer=None,
+        src_pre_tokenizer=None,
+        src_bpe_tokenizer=None,
+        append_eos: Optional[bool] = True,
+        alignment: Optional[List[str]] = None,
+        use_src_lang_id: Optional[int] = 0,
+    ):
+        super().__init__(
+            split,
+            is_train_split,
+            cfg,
+            audio_paths,
+            n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            append_eos=append_eos,
+        )
+
+        self.src_dict = src_dict
+        self.src_pre_tokenizer = src_pre_tokenizer
+        self.src_bpe_tokenizer = src_bpe_tokenizer
+        self.alignment = None
+        self.use_src_lang_id = use_src_lang_id
+        if alignment is not None:
+            self.alignment = [
+                [float(s) for s in sample.split()] for sample in alignment  # each entry is a whitespace-separated string of floats
+            ]
+
+    def get_tokenized_src_text(self, index: int):  # source text run through the source pre-tokenizer, then the source BPE
+        text = self.tokenize(self.src_pre_tokenizer, self.src_texts[index])
+        text = self.tokenize(self.src_bpe_tokenizer, text)
+        return text
+
+    def __getitem__(self, index: int) -> SpeechToTextJointDatasetItem:
+        s2t_dataset_item = super().__getitem__(index)
+        src_tokens = None
+        src_lang_tag = None
+        if self.src_texts is not None and self.src_dict is not None:
+            src_tokens = self.get_tokenized_src_text(index)
+            src_tokens = self.src_dict.encode_line(
+                src_tokens, add_if_not_exist=False, append_eos=True  # EOS is always appended to the source text
+            ).long()
+            if self.use_src_lang_id > 0:
+                src_lang_tag = self.get_lang_tag_idx(
+                    self.src_langs[index], self.src_dict
+                )
+        tgt_lang_tag = None
+        if self.cfg.prepend_tgt_lang_tag_no_change:
+            # prepend_tgt_lang_tag_no_change: modify prev_output_tokens instead
+            tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
+        ali = None
+        if self.alignment is not None:
+            ali = torch.Tensor(self.alignment[index]).float()
+
+        return SpeechToTextJointDatasetItem(
+            index=index,
+            source=s2t_dataset_item.source,
+            target=s2t_dataset_item.target,
+            src_txt_tokens=src_tokens,
+            tgt_lang_tag=tgt_lang_tag,
+            src_lang_tag=src_lang_tag,
+            tgt_alignment=ali,
+        )
+
+    def __len__(self):  # dataset length is the stored sample count
+        return self.n_samples
+
+    def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict:  # parent batch plus source text, alignment and lang-tag handling
+        s2t_out = super().collater(samples, return_order=True)
+        if s2t_out == {}:  # the parent returns {} for an empty sample list
+            return s2t_out
+        net_input, order = s2t_out["net_input"], s2t_out["order"]
+
+        if self.src_texts is not None and self.src_dict is not None:
+            src_txt_tokens = fairseq_data_utils.collate_tokens(
+                [x.src_txt_tokens for x in samples],
+                self.src_dict.pad(),
+                self.src_dict.eos(),
+                left_pad=False,
+                move_eos_to_beginning=False,
+            )
+            src_txt_lengths = torch.tensor(
+                [x.src_txt_tokens.size()[0] for x in samples], dtype=torch.long
+            )
+            if self.use_src_lang_id > 0:
+                src_lang_idxs = torch.tensor(
+                    [s.src_lang_tag for s in samples], dtype=src_txt_tokens.dtype
+                )
+                if self.use_src_lang_id == 1:  # replace eos with lang_id
+                    eos_idx = src_txt_lengths - 1  # last token position of each row
+                    src_txt_tokens.scatter_(
+                        1, eos_idx.view(-1, 1), src_lang_idxs.view(-1, 1)
+                    )
+                else:
+                    raise NotImplementedError("Implementation is required")  # only modes 0 and 1 are handled
+
+            src_txt_tokens = src_txt_tokens.index_select(0, order)  # reorder to match the parent's sorted rows
+            src_txt_lengths = src_txt_lengths.index_select(0, order)
+            net_input["src_txt_tokens"] = src_txt_tokens
+            net_input["src_txt_lengths"] = src_txt_lengths
+
+        net_input["alignment"] = None
+        if self.alignment is not None:
+            max_len = max([s.tgt_alignment.size(0) for s in samples])
+            alignment = torch.ones(len(samples), max_len).float()  # rows shorter than max_len keep the 1.0 fill
+            for i, s in enumerate(samples):
+                cur_len = s.tgt_alignment.size(0)
+                alignment[i][:cur_len].copy_(s.tgt_alignment)
+            net_input["alignment"] = alignment.index_select(0, order)
+
+        if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
+            for i in range(len(samples)):
+                net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag  # row i is original sample order[i]
+
+        out = {  # rebuilt without the parent's "order" and "speaker" entries
+            "id": s2t_out["id"],
+            "net_input": net_input,
+            "target": s2t_out["target"],
+            "target_lengths": s2t_out["target_lengths"],
+            "ntokens": s2t_out["ntokens"],
+            "nsentences": len(samples),
+        }
+        return out
+
+
+class SpeechToTextJointDatasetCreator(SpeechToTextDatasetCreator):  # builds SpeechToTextJointDataset objects from TSV manifests
+    KEY_ALIGN = "align"  # optional manifest column holding the alignment values
+
+    @classmethod
+    def _from_list(  # build one joint dataset from already-loaded manifest rows
+        cls,
+        split_name: str,
+        is_train_split,
+        samples: List[Dict],
+        cfg: S2TJointDataConfig,
+        tgt_dict,
+        src_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        src_pre_tokenizer,
+        src_bpe_tokenizer,
+        append_eos,
+        use_src_lang_id,
+    ) -> SpeechToTextJointDataset:
+        audio_root = Path(cfg.audio_root)
+        ids = [s[cls.KEY_ID] for s in samples]
+        audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]  # audio column is joined onto cfg.audio_root
+        n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
+        tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
+        src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]  # optional columns fall back to the class defaults
+        speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
+        src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
+        tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
+        tgt_alignment = None
+        if cls.KEY_ALIGN in samples[0].keys():  # presence of the column is decided from the first row only
+            tgt_alignment = [s[cls.KEY_ALIGN] for s in samples]
+        return SpeechToTextJointDataset(
+            split_name,
+            is_train_split,
+            cfg,
+            audio_paths,
+            n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            src_dict=src_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            src_pre_tokenizer=src_pre_tokenizer,
+            src_bpe_tokenizer=src_bpe_tokenizer,
+            append_eos=append_eos,
+            alignment=tgt_alignment,
+            use_src_lang_id=use_src_lang_id,
+        )
+
+    @classmethod
+    def _from_tsv(  # load one split's manifest and build its joint dataset
+        cls,
+        root: str,
+        cfg: S2TJointDataConfig,
+        split: str,
+        tgt_dict,
+        src_dict,
+        is_train_split: bool,
+        pre_tokenizer,
+        bpe_tokenizer,
+        src_pre_tokenizer,
+        src_bpe_tokenizer,
+        append_eos: bool,
+        use_src_lang_id: int,
+    ) -> SpeechToTextJointDataset:
+        samples = cls._load_samples_from_tsv(root, split)
+        return cls._from_list(
+            split,
+            is_train_split,
+            samples,
+            cfg,
+            tgt_dict,
+            src_dict,
+            pre_tokenizer,
+            bpe_tokenizer,
+            src_pre_tokenizer,
+            src_bpe_tokenizer,
+            append_eos,
+            use_src_lang_id,
+        )
+
+    @classmethod
+    def from_tsv(  # public entry point: `splits` is a comma-separated list of split names
+        cls,
+        root: str,
+        cfg: S2TJointDataConfig,
+        splits: str,
+        tgt_dict,
+        src_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        src_pre_tokenizer,
+        src_bpe_tokenizer,
+        is_train_split: bool,
+        epoch: int,
+        seed: int,
+        append_eos: Optional[bool] = True,
+        use_src_lang_id: Optional[int] = 0,
+    ) -> SpeechToTextJointDataset:
+        datasets = [
+            cls._from_tsv(
+                root,
+                cfg,
+                split,
+                tgt_dict,
+                src_dict,
+                is_train_split,
+                pre_tokenizer,
+                bpe_tokenizer,
+                src_pre_tokenizer,
+                src_bpe_tokenizer,
+                append_eos=append_eos,
+                use_src_lang_id=use_src_lang_id,
+            )
+            for split in splits.split(",")
+        ]
+
+        if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:  # resample only for multi-split training with alpha != 1
+            # temperature-based sampling
+            size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
+            datasets = [
+                ResamplingDataset(
+                    d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)  # sample with replacement when the ratio is >= 1
+                )
+                for r, d in zip(size_ratios, datasets)
+            ]
+
+        return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]  # a single split is returned unwrapped
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
new file mode 100644
index 0000000000..13612b458b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import torch
+
+from fairseq.data import Dictionary
+from fairseq.data import data_utils as fairseq_data_utils
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+from fairseq.data.audio.speech_to_text_dataset import (
+    S2TDataConfig,
+    SpeechToTextDataset,
+    SpeechToTextDatasetCreator,
+    _collate_frames,
+)
+
+
+@dataclass
+class TextToSpeechDatasetItem(object):  # one example as returned by TextToSpeechDataset.__getitem__
+    index: int  # dataset index the example was fetched with
+    source: torch.Tensor  # taken from the parent item's `source`; collated with _collate_frames
+    target: Optional[torch.Tensor] = None  # taken from the parent item's `target`; collated into "src_tokens"
+    speaker_id: Optional[int] = None  # taken from the parent item's `speaker_id`
+    duration: Optional[torch.Tensor] = None  # long tensor with one trailing 0 appended for EOS
+    pitch: Optional[torch.Tensor] = None  # float tensor with one trailing 0 appended for EOS
+    energy: Optional[torch.Tensor] = None  # float tensor with one trailing 0 appended for EOS
+
+
+class TextToSpeechDataset(SpeechToTextDataset):  # parent dataset with roles swapped in collater: tokens are the input, frames the target
+    def __init__(
+        self,
+        split: str,
+        is_train_split: bool,
+        cfg: S2TDataConfig,
+        audio_paths: List[str],
+        n_frames: List[int],
+        src_texts: Optional[List[str]] = None,
+        tgt_texts: Optional[List[str]] = None,
+        speakers: Optional[List[str]] = None,
+        src_langs: Optional[List[str]] = None,
+        tgt_langs: Optional[List[str]] = None,
+        ids: Optional[List[str]] = None,
+        tgt_dict: Optional[Dictionary] = None,
+        pre_tokenizer=None,
+        bpe_tokenizer=None,
+        n_frames_per_step=1,
+        speaker_to_id=None,
+        durations: Optional[List[List[int]]] = None,  # per-example integer durations, or None to disable
+        pitches: Optional[List[str]] = None,  # per-example path read with get_features_or_waveform
+        energies: Optional[List[str]] = None,  # per-example path read with get_features_or_waveform
+    ):
+        super(TextToSpeechDataset, self).__init__(  # everything but the three TTS extras goes to the parent
+            split,
+            is_train_split,
+            cfg,
+            audio_paths,
+            n_frames,
+            src_texts=src_texts,
+            tgt_texts=tgt_texts,
+            speakers=speakers,
+            src_langs=src_langs,
+            tgt_langs=tgt_langs,
+            ids=ids,
+            tgt_dict=tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            n_frames_per_step=n_frames_per_step,
+            speaker_to_id=speaker_to_id,
+        )
+        self.durations = durations
+        self.pitches = pitches
+        self.energies = energies
+
+    def __getitem__(self, index: int) -> TextToSpeechDatasetItem:  # parent item plus optional duration/pitch/energy
+        s2t_item = super().__getitem__(index)
+
+        duration, pitch, energy = None, None, None  # each stays None when its list was not provided
+        if self.durations is not None:
+            duration = torch.tensor(
+                self.durations[index] + [0], dtype=torch.long  # pad 0 for EOS
+            )
+        if self.pitches is not None:
+            pitch = get_features_or_waveform(self.pitches[index])  # loaded lazily, per item
+            pitch = torch.from_numpy(
+                np.concatenate((pitch, [0]))  # pad 0 for EOS
+            ).float()
+        if self.energies is not None:
+            energy = get_features_or_waveform(self.energies[index])  # loaded lazily, per item
+            energy = torch.from_numpy(
+                np.concatenate((energy, [0]))  # pad 0 for EOS
+            ).float()
+        return TextToSpeechDatasetItem(
+            index=index,
+            source=s2t_item.source,
+            target=s2t_item.target,
+            speaker_id=s2t_item.speaker_id,
+            duration=duration,
+            pitch=pitch,
+            energy=energy,
+        )
+
+    def collater(self, samples: List[TextToSpeechDatasetItem]) -> Dict[str, Any]:  # batch dict; {} for an empty list
+        if len(samples) == 0:
+            return {}
+
+        src_lengths, order = torch.tensor(  # lengths of the token tensors, sorted longest first
+            [s.target.shape[0] for s in samples], dtype=torch.long
+        ).sort(descending=True)
+        id_ = torch.tensor([s.index for s in samples], dtype=torch.long).index_select(
+            0, order
+        )
+        feat = _collate_frames(  # stacked `source` tensors, reordered to match `order`
+            [s.source for s in samples], self.cfg.use_audio_input
+        ).index_select(0, order)
+        target_lengths = torch.tensor(  # first-dimension size of each `source`, same order as feat
+            [s.source.shape[0] for s in samples], dtype=torch.long
+        ).index_select(0, order)
+
+        src_tokens = fairseq_data_utils.collate_tokens(  # right-padded token batch built from `target`
+            [s.target for s in samples],
+            self.tgt_dict.pad(),
+            self.tgt_dict.eos(),
+            left_pad=False,
+            move_eos_to_beginning=False,
+        ).index_select(0, order)
+
+        speaker = None
+        if self.speaker_to_id is not None:
+            speaker = (  # column vector of speaker ids, shape (batch, 1)
+                torch.tensor([s.speaker_id for s in samples], dtype=torch.long)
+                .index_select(0, order)
+                .view(-1, 1)
+            )
+
+        bsz, _, d = feat.size()  # feat must be 3-D here
+        prev_output_tokens = torch.cat(  # feat shifted right by one step, with a zero frame first
+            (feat.new_zeros((bsz, 1, d)), feat[:, :-1, :]), dim=1
+        )
+
+        durations, pitches, energies = None, None, None
+        if self.durations is not None:
+            durations = fairseq_data_utils.collate_tokens(  # padded with 0
+                [s.duration for s in samples], 0
+            ).index_select(0, order)
+            assert src_tokens.shape[1] == durations.shape[1]  # one duration per token position
+        if self.pitches is not None:
+            pitches = _collate_frames([s.pitch for s in samples], True)
+            pitches = pitches.index_select(0, order)
+            assert src_tokens.shape[1] == pitches.shape[1]  # one pitch value per token position
+        if self.energies is not None:
+            energies = _collate_frames([s.energy for s in samples], True)
+            energies = energies.index_select(0, order)
+            assert src_tokens.shape[1] == energies.shape[1]  # one energy value per token position
+        src_texts = [self.tgt_dict.string(samples[i].target) for i in order]  # decoded text, same order as the batch
+
+        return {
+            "id": id_,
+            "net_input": {
+                "src_tokens": src_tokens,
+                "src_lengths": src_lengths,
+                "prev_output_tokens": prev_output_tokens,
+            },
+            "speaker": speaker,
+            "target": feat,
+            "durations": durations,
+            "pitches": pitches,
+            "energies": energies,
+            "target_lengths": target_lengths,
+            "ntokens": target_lengths.sum().item(),  # single tensor reduction; same value as summing element-wise
+            "nsentences": len(samples),
+            "src_texts": src_texts,
+        }
+
+
+class TextToSpeechDatasetCreator(SpeechToTextDatasetCreator):  # builds TextToSpeechDataset objects from sample dicts
+    KEY_DURATION = "duration"  # optional per-sample key: space-separated integers
+    KEY_PITCH = "pitch"  # optional per-sample key: path relative to cfg.audio_root
+    KEY_ENERGY = "energy"  # optional per-sample key: path relative to cfg.audio_root
+
+    @classmethod
+    def _from_list(  # turn a list of sample dicts into a TextToSpeechDataset
+        cls,
+        split_name: str,
+        is_train_split,
+        samples: List[Dict],
+        cfg: S2TDataConfig,
+        tgt_dict,
+        pre_tokenizer,
+        bpe_tokenizer,
+        n_frames_per_step,
+        speaker_to_id,
+        multitask=None,  # accepted but not used in this method
+    ) -> TextToSpeechDataset:
+        audio_root = Path(cfg.audio_root)
+        ids = [s[cls.KEY_ID] for s in samples]  # required key: KeyError if missing
+        audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]  # required key
+        n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]  # required key
+        tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]  # required key
+        src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]  # optional, class default
+        speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]  # optional, class default
+        src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]  # optional, class default
+        tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]  # optional, class default
+
+        durations = [s.get(cls.KEY_DURATION, None) for s in samples]
+        durations = [  # parse "3 5 2" into [3, 5, 2]; missing entries stay None
+            None if dd is None else [int(d) for d in dd.split(" ")] for dd in durations
+        ]
+        durations = None if any(dd is None for dd in durations) else durations  # all-or-nothing: one gap disables durations
+
+        pitches = [s.get(cls.KEY_PITCH, None) for s in samples]
+        pitches = [  # resolve each path against audio_root; missing entries stay None
+            None if pp is None else (audio_root / pp).as_posix() for pp in pitches
+        ]
+        pitches = None if any(pp is None for pp in pitches) else pitches  # all-or-nothing, as for durations
+
+        energies = [s.get(cls.KEY_ENERGY, None) for s in samples]
+        energies = [  # resolve each path against audio_root; missing entries stay None
+            None if ee is None else (audio_root / ee).as_posix() for ee in energies
+        ]
+        energies = None if any(ee is None for ee in energies) else energies  # all-or-nothing, as for durations
+
+        return TextToSpeechDataset(  # positional order must match TextToSpeechDataset.__init__
+            split_name,
+            is_train_split,
+            cfg,
+            audio_paths,
+            n_frames,
+            src_texts,
+            tgt_texts,
+            speakers,
+            src_langs,
+            tgt_langs,
+            ids,
+            tgt_dict,
+            pre_tokenizer,
+            bpe_tokenizer,
+            n_frames_per_step,
+            speaker_to_id,
+            durations,
+            pitches,
+            energies,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
new file mode 100644
index 0000000000..57f8bb571b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
@@ -0,0 +1,48 @@
+import os
+from fairseq.data.audio import (
+    AudioTransform,
+    CompositeAudioTransform,
+    import_transforms,
+    register_audio_transform,
+)
+
+
+class AudioWaveformTransform(AudioTransform):  # base class for waveform transforms; adds nothing to AudioTransform
+    pass
+
+
+AUDIO_WAVEFORM_TRANSFORM_REGISTRY = {}  # looked up by name in get_audio_waveform_transform
+AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES = set()  # passed to register_audio_transform, which presumably fills it — confirm
+
+
+def get_audio_waveform_transform(name):  # return the registry entry for `name`; KeyError if it is not registered
+    return AUDIO_WAVEFORM_TRANSFORM_REGISTRY[name]
+
+
+def register_audio_waveform_transform(name):  # forwards to register_audio_transform; its result is used as a class decorator
+    return register_audio_transform(
+        name,
+        AudioWaveformTransform,  # base class handed to the generic registration helper
+        AUDIO_WAVEFORM_TRANSFORM_REGISTRY,
+        AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES,
+    )
+
+
+import_transforms(os.path.dirname(__file__), "waveform")  # runs at import time on this package's directory; presumably imports sibling modules so they register — confirm
+
+
+class CompositeAudioWaveformTransform(CompositeAudioTransform):  # applies self.transforms to a waveform one after another
+    @classmethod
+    def from_config_dict(cls, config=None):  # construction is delegated to the base-class helper
+        return super()._from_config_dict(
+            cls,  # passed explicitly; the helper's signature lives in CompositeAudioTransform (not shown here)
+            "waveform",
+            get_audio_waveform_transform,
+            CompositeAudioWaveformTransform,
+            config,
+        )
+
+    def __call__(self, x, sample_rate):  # returns the final (x, sample_rate) pair
+        for t in self.transforms:
+            x, sample_rate = t(x, sample_rate)  # each transform's output pair feeds the next transform
+        return x, sample_rate
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
new file mode 100644
index 0000000000..401ce30943
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
@@ -0,0 +1,201 @@
+from pathlib import Path
+import numpy as np
+from math import ceil
+
+from fairseq.data.audio import rand_uniform
+from fairseq.data.audio.waveform_transforms import (
+    AudioWaveformTransform,
+    register_audio_waveform_transform,
+)
+
+SNR_MIN = 5.0  # default lower bound of the sampled signal-to-noise ratio (shown in dB by __repr__)
+SNR_MAX = 15.0  # default upper bound of the sampled signal-to-noise ratio
+RATE = 0.25  # default probability that a call actually adds noise
+
+NOISE_RATE = 1.0  # default number of noise bursts per second (SporadicNoiseAugmentTransform)
+NOISE_LEN_MEAN = 0.2  # default mean burst length in seconds (SporadicNoiseAugmentTransform)
+NOISE_LEN_STD = 0.05  # default standard deviation of the burst length, in seconds
+
+
+class NoiseAugmentTransform(AudioWaveformTransform):  # mixes a random .wav from samples_path into the input at a random SNR
+    @classmethod
+    def from_config_dict(cls, config=None):  # missing keys fall back to the module-level defaults
+        _config = {} if config is None else config
+        return cls(
+            _config.get("samples_path", None),  # a missing path fails the assertion in __init__
+            _config.get("snr_min", SNR_MIN),
+            _config.get("snr_max", SNR_MAX),
+            _config.get("rate", RATE),
+        )
+
+    def __init__(
+        self,
+        samples_path: str,
+        snr_min: float = SNR_MIN,
+        snr_max: float = SNR_MAX,
+        rate: float = RATE,
+    ):
+        # Sanity checks
+        assert (
+            samples_path
+        ), "need to provide path to audio samples for noise augmentation"
+        assert snr_max >= snr_min, f"empty signal-to-noise range ({snr_min}, {snr_max})"
+        assert rate >= 0 and rate <= 1, "rate should be a float between 0 to 1"
+
+        self.paths = list(Path(samples_path).glob("**/*.wav"))  # every .wav under samples_path, recursively
+        self.n_samples = len(self.paths)
+        assert self.n_samples > 0, f"no audio files found in {samples_path}"
+
+        self.snr_min = snr_min
+        self.snr_max = snr_max
+        self.rate = rate
+
+    def __repr__(self):  # e.g. ClassName(n_samples=..., snr=...-...dB, rate=...)
+        return (
+            self.__class__.__name__
+            + "("
+            + ", ".join(
+                [
+                    f"n_samples={self.n_samples}",
+                    f"snr={self.snr_min}-{self.snr_max}dB",
+                    f"rate={self.rate}",
+                ]
+            )
+            + ")"
+        )
+
+    def pick_sample(self, goal_shape, always_2d=False, use_sample_rate=None):  # random noise clip cut/tiled to goal_shape
+        from fairseq.data.audio.audio_utils import get_waveform  # imported at call time, not at module import
+
+        path = self.paths[np.random.randint(0, self.n_samples)]  # uniform choice among the collected files
+        sample = get_waveform(
+            path, always_2d=always_2d, output_sample_rate=use_sample_rate
+        )[0]  # first element of the returned tuple; presumably the waveform array — confirm
+
+        # Check dimensions match, else silently skip adding noise to sample
+        # NOTE: SHOULD THIS QUIT WITH AN ERROR?
+        is_2d = len(goal_shape) == 2
+        if len(goal_shape) != sample.ndim or (
+            is_2d and goal_shape[0] != sample.shape[0]
+        ):
+            return np.zeros(goal_shape)  # all-zero noise: _mix then leaves the source unchanged
+
+        # Cut/repeat sample to size
+        len_dim = len(goal_shape) - 1  # the last axis is treated as the length axis
+        n_repeat = ceil(goal_shape[len_dim] / sample.shape[len_dim])  # tiles needed to reach the goal length
+        repeated = np.tile(sample, [1, n_repeat] if is_2d else n_repeat)
+        start = np.random.randint(0, repeated.shape[len_dim] - goal_shape[len_dim] + 1)  # random crop offset
+        return (
+            repeated[:, start : start + goal_shape[len_dim]]
+            if is_2d
+            else repeated[start : start + goal_shape[len_dim]]
+        )
+
+    def _mix(self, source, noise, snr):  # source + scaled noise, with source/noise power ratio 10**(snr/10)
+        get_power = lambda x: np.mean(x**2)  # mean squared amplitude
+        if get_power(noise):  # zero-power noise would make the division below divide by zero
+            scl = np.sqrt(
+                get_power(source) / (np.power(10, snr / 10) * get_power(noise))
+            )
+        else:
+            scl = 0
+        return 1 * source + scl * noise
+
+    def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):  # hook that subclasses override
+        return self.pick_sample(goal_shape, always_2d, use_sample_rate)
+
+    def __call__(self, source, sample_rate):  # returns (possibly augmented source, unchanged sample_rate)
+        if np.random.random() > self.rate:  # skip augmentation with probability 1 - rate
+            return source, sample_rate
+
+        noise = self._get_noise(  # NOTE(review): always_2d=True presumably yields 2-D noise, so a 1-D source gets zeros — confirm
+            source.shape, always_2d=True, use_sample_rate=sample_rate
+        )
+
+        return (
+            self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)),  # SNR drawn per call
+            sample_rate,
+        )
+
+
+@register_audio_waveform_transform("musicaugment")
+class MusicAugmentTransform(NoiseAugmentTransform):  # same behavior as the base class; only the registered name differs
+    pass
+
+
+@register_audio_waveform_transform("backgroundnoiseaugment")
+class BackgroundNoiseAugmentTransform(NoiseAugmentTransform):  # same behavior as the base class; only the registered name differs
+    pass
+
+
+@register_audio_waveform_transform("babbleaugment")
+class BabbleAugmentTransform(NoiseAugmentTransform):  # noise is a mixture of several randomly picked clips
+    def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):
+        for i in range(np.random.randint(3, 8)):  # between 3 and 7 clips (upper bound is exclusive)
+            speech = self.pick_sample(goal_shape, always_2d, use_sample_rate)
+            if i == 0:
+                agg_noise = speech  # the first clip seeds the mixture
+            else:  # SNR scaled by i (how many noise signals already in agg_noise)
+                agg_noise = self._mix(agg_noise, speech, i)
+        return agg_noise
+
+
+@register_audio_waveform_transform("sporadicnoiseaugment")
+class SporadicNoiseAugmentTransform(NoiseAugmentTransform):  # noise is short bursts placed at random offsets
+    @classmethod
+    def from_config_dict(cls, config=None):  # missing keys fall back to the module-level defaults
+        _config = {} if config is None else config
+        return cls(
+            _config.get("samples_path", None),
+            _config.get("snr_min", SNR_MIN),
+            _config.get("snr_max", SNR_MAX),
+            _config.get("rate", RATE),
+            _config.get("noise_rate", NOISE_RATE),
+            _config.get("noise_len_mean", NOISE_LEN_MEAN),
+            _config.get("noise_len_std", NOISE_LEN_STD),
+        )
+
+    def __init__(
+        self,
+        samples_path: str,
+        snr_min: float = SNR_MIN,
+        snr_max: float = SNR_MAX,
+        rate: float = RATE,
+        noise_rate: float = NOISE_RATE,  # noises per second
+        noise_len_mean: float = NOISE_LEN_MEAN,  # length of noises in seconds
+        noise_len_std: float = NOISE_LEN_STD,
+    ):
+        super().__init__(samples_path, snr_min, snr_max, rate)  # the base class validates and stores the shared settings
+        self.noise_rate = noise_rate
+        self.noise_len_mean = noise_len_mean
+        self.noise_len_std = noise_len_std
+
+    def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):  # use_sample_rate must be a number (None raises TypeError)
+        agg_noise = np.zeros(goal_shape)  # silence everywhere except where bursts are added
+        len_dim = len(goal_shape) - 1  # the last axis is treated as the length axis
+        is_2d = len(goal_shape) == 2
+
+        n_noises = round(self.noise_rate * goal_shape[len_dim] / use_sample_rate)  # bursts/second times duration in seconds
+        start_pointers = [
+            round(rand_uniform(0, goal_shape[len_dim])) for _ in range(n_noises)  # random start offset per burst
+        ]
+
+        for start_pointer in start_pointers:
+            noise_shape = list(goal_shape)
+            len_seconds = np.random.normal(self.noise_len_mean, self.noise_len_std)
+            noise_shape[len_dim] = round(max(0, len_seconds) * use_sample_rate)  # negative draws are clamped to length 0
+            end_pointer = start_pointer + noise_shape[len_dim]
+            if end_pointer >= goal_shape[len_dim]:  # bursts that would reach the end are dropped, not truncated
+                continue
+
+            noise = self.pick_sample(noise_shape, always_2d, use_sample_rate)
+            if is_2d:
+                agg_noise[:, start_pointer:end_pointer] = (  # overlapping bursts add up
+                    agg_noise[:, start_pointer:end_pointer] + noise
+                )
+            else:
+                agg_noise[start_pointer:end_pointer] = (  # overlapping bursts add up
+                    agg_noise[start_pointer:end_pointer] + noise
+                )
+
+        return agg_noise
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
new file mode 100644
index 0000000000..8f70c90df3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
@@ -0,0 +1,165 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq import utils
+
+from . import FairseqDataset
+
+
+def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True):
+    """Backtranslate a list of samples.
+
+    Given an input (*samples*) of the form:
+
+        [{'id': 1, 'source': 'hallo welt'}]
+
+    this will return:
+
+        [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]
+
+    Args:
+        samples (List[dict]): samples to backtranslate. Individual samples are
+            expected to have a 'source' key, which will become the 'target'
+            after backtranslation.
+        collate_fn (callable): function to collate samples into a mini-batch
+        generate_fn (callable): function to generate backtranslations
+        cuda (bool): use GPU for generation (default: ``True``)
+
+    Returns:
+        List[dict]: an updated list of samples with a backtranslated source
+    """
+    collated_samples = collate_fn(samples)  # must contain an "id" entry (read in the comprehension below)
+    s = utils.move_to_cuda(collated_samples) if cuda else collated_samples  # batch handed to generate_fn
+    generated_sources = generate_fn(s)  # iterated in lockstep with collated_samples["id"]
+
+    id_to_src = {sample["id"]: sample["source"] for sample in samples}  # original source keyed by sample id
+
+    # Go through each tgt sentence in batch and its corresponding best
+    # generated hypothesis and create a backtranslation data pair
+    # {id: id, source: generated backtranslation, target: original tgt}
+    return [
+        {
+            "id": id.item(),
+            "target": id_to_src[id.item()],  # the original source becomes the target
+            "source": hypos[0]["tokens"].cpu(),  # first hypothesis, moved back to CPU
+        }
+        for id, hypos in zip(collated_samples["id"], generated_sources)  # NOTE: `id` shadows the builtin in this scope
+    ]
+
+
+class BacktranslationDataset(FairseqDataset):
+    """
+    Sets up a backtranslation dataset which takes a tgt batch, generates
+    a src using a tgt-src backtranslation function (*backtranslation_fn*),
+    and returns the corresponding `{generated src, input tgt}` batch.
+
+    Args:
+        tgt_dataset (~fairseq.data.FairseqDataset): the dataset to be
+            backtranslated. Only the source side of this dataset will be used.
+            After backtranslation, the source sentences in this dataset will be
+            returned as the targets.
+        src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated
+            sentences.
+        tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of
+            sentences to be backtranslated.
+        backtranslation_fn (callable, optional): function to call to generate
+            backtranslations. This is typically the `generate` method of a
+            :class:`~fairseq.sequence_generator.SequenceGenerator` object.
+            Pass in None when it is not available at initialization time, and
+            use set_backtranslation_fn function to set it when available.
+        output_collater (callable, optional): function to call on the
+            backtranslated samples to create the final batch
+            (default: ``tgt_dataset.collater``).
+        cuda: use GPU for generation
+    """
+
+    def __init__(
+        self,
+        tgt_dataset,
+        src_dict,
+        tgt_dict=None,
+        backtranslation_fn=None,
+        output_collater=None,
+        cuda=True,
+        **kwargs  # accepted and ignored
+    ):
+        self.tgt_dataset = tgt_dataset
+        self.backtranslation_fn = backtranslation_fn
+        self.output_collater = (
+            output_collater if output_collater is not None else tgt_dataset.collater
+        )
+        self.cuda = cuda if torch.cuda.is_available() else False  # forced to False when CUDA is unavailable
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+
+    def __getitem__(self, index):
+        """
+        Returns a single sample from *tgt_dataset*. Note that backtranslation is
+        not applied in this step; use :func:`collater` instead to backtranslate
+        a batch of samples.
+        """
+        return self.tgt_dataset[index]
+
+    def __len__(self):
+        return len(self.tgt_dataset)
+
+    def set_backtranslation_fn(self, backtranslation_fn):  # late binding for a function unavailable at construction
+        self.backtranslation_fn = backtranslation_fn
+
+    def collater(self, samples):
+        """Merge and backtranslate a list of samples to form a mini-batch.
+
+        Using the samples from *tgt_dataset*, load a collated target sample to
+        feed to the backtranslation model. Then take the backtranslation with
+        the best score as the source and the original input as the target.
+
+        Note: we expect *tgt_dataset* to provide a function `collater()` that
+        will collate samples into the format expected by *backtranslation_fn*.
+        After backtranslation, we will feed the new list of samples (i.e., the
+        `(backtranslated source, original source)` pairs) to *output_collater*
+        and return the result.
+
+        Args:
+            samples (List[dict]): samples to backtranslate and collate
+
+        Returns:
+            dict: a mini-batch with keys coming from *output_collater*
+        """
+        if samples[0].get("is_dummy", False):  # NOTE: raises IndexError when samples is empty
+            return samples  # dummy batches are returned as-is, uncollated
+        samples = backtranslate_samples(
+            samples=samples,
+            collate_fn=self.tgt_dataset.collater,
+            generate_fn=(lambda net_input: self.backtranslation_fn(net_input)),  # attribute is read when called
+            cuda=self.cuda,
+        )
+        return self.output_collater(samples)
+
+    def num_tokens(self, index):
+        """Just use the tgt dataset num_tokens"""
+        return self.tgt_dataset.num_tokens(index)
+
+    def ordered_indices(self):
+        """Just use the tgt dataset ordered_indices"""
+        return self.tgt_dataset.ordered_indices()
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used
+        when filtering a dataset with ``--max-positions``.
+
+        Note: we use *tgt_dataset* to approximate the length of the source
+        sentence, since we do not know the actual length until after
+        backtranslation.
+        """
+        tgt_size = self.tgt_dataset.size(index)[0]  # first component of the wrapped dataset's size
+        return (tgt_size, tgt_size)  # the same value stands in for both sides
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.tgt_dataset, "supports_prefetch", False)  # False when the wrapped dataset lacks the attribute
+
+    def prefetch(self, indices):
+        return self.tgt_dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
new file mode 100644
index 0000000000..134d398b47
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+class BaseWrapperDataset(FairseqDataset):  # forwards every operation to the wrapped `dataset`; subclasses override selectively
+    def __init__(self, dataset):
+        super().__init__()
+        self.dataset = dataset
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        if hasattr(self.dataset, "collater"):
+            return self.dataset.collater(samples)
+        else:
+            return default_collate(samples)  # PyTorch's default collation when the wrapped dataset has no collater
+
+    @property
+    def sizes(self):
+        return self.dataset.sizes
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)  # False when the wrapped dataset lacks the attribute
+
+    def attr(self, attr: str, index: int):
+        return self.dataset.attr(attr, index)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)  # result of the wrapped call is discarded; returns None
+
+    def get_batch_shapes(self):
+        return self.dataset.get_batch_shapes()
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        return self.dataset.batch_by_size(  # arguments are forwarded unchanged
+            indices,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+        )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        return self.dataset.filter_indices_by_size(indices, max_sizes)
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return self.dataset.can_reuse_epoch_itr_across_epochs
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        if hasattr(self.dataset, "set_epoch"):  # the wrapped object may not implement set_epoch
+            self.dataset.set_epoch(epoch)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
new file mode 100644
index 0000000000..0f94100148
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch.nn.functional as F
+from fairseq.data import BaseWrapperDataset
+from fairseq.data.data_utils import get_buckets, get_bucketed_sizes
+
+
+class BucketPadLengthDataset(BaseWrapperDataset):
+    """
+    Bucket and pad item lengths to the nearest bucket size. This can be used to
+    reduce the number of unique batch shapes, which is important on TPUs since
+    each new batch shape requires a recompilation.
+
+    Args:
+        dataset (FairseqDataset): dataset to bucket
+        sizes (List[int]): all item sizes
+        num_buckets (int): number of buckets to create
+        pad_idx (int): padding symbol
+        left_pad (bool): if True, pad on the left; otherwise right pad
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        num_buckets,
+        pad_idx,
+        left_pad,
+        tensor_key=None,  # when set, items are indexed with this key and only that entry is padded
+    ):
+        super().__init__(dataset)
+        self.pad_idx = pad_idx
+        self.left_pad = left_pad
+
+        assert num_buckets > 0
+        self.buckets = get_buckets(sizes, num_buckets)
+        self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)  # indexed by item position in __getitem__
+        self._tensor_key = tensor_key
+
+    def _set_tensor(self, item, val):  # counterpart of _get_tensor: put the padded tensor back
+        if self._tensor_key is None:
+            return val  # the item itself is the tensor
+        item[self._tensor_key] = val  # mutates the item returned by the wrapped dataset
+        return item
+
+    def _get_tensor(self, item):  # pick the tensor to pad out of an item
+        if self._tensor_key is None:
+            return item
+        return item[self._tensor_key]
+
+    def _pad(self, tensor, bucket_size, dim=-1):  # NOTE(review): dim only affects the size lookup below
+        num_pad = bucket_size - tensor.size(dim)
+        return F.pad(  # a 2-element pad tuple always pads the last dimension
+            tensor,
+            (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad),
+            value=self.pad_idx,
+        )
+
+    def __getitem__(self, index):  # item from the wrapped dataset, padded to its bucket size
+        item = self.dataset[index]
+        bucket_size = self._bucketed_sizes[index]
+        tensor = self._get_tensor(item)
+        padded = self._pad(tensor, bucket_size)
+        return self._set_tensor(item, padded)
+
+    @property
+    def sizes(self):  # reports the bucketed sizes, not the original ones
+        return self._bucketed_sizes
+
+    def num_tokens(self, index):
+        return self._bucketed_sizes[index]
+
+    def size(self, index):
+        return self._bucketed_sizes[index]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
new file mode 100644
index 0000000000..a433091956
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
@@ -0,0 +1,576 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import json
+import logging
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.data
+
+from . import data_utils
+from fairseq.data.fairseq_dataset import FairseqDataset
+
+F0_FRAME_SPACE = 0.005  # sec
+
+
+logger = logging.getLogger(__name__)
+
+
+class ExpressiveCodeDataConfig(object):  # property-style accessors over a JSON config file
+    def __init__(self, json_path):
+        with open(json_path, "r") as cfg_file:
+            self.config = json.load(cfg_file)
+        self._manifests = self.config["manifests"]
+
+    @property
+    def manifests(self):
+        return self._manifests
+
+    @property
+    def n_units(self):
+        return self.config["n_units"]
+
+    @property
+    def sampling_rate(self):
+        return self.config["sampling_rate"]
+
+    @property
+    def code_hop_size(self):
+        return self.config["code_hop_size"]
+
+    @property
+    def f0_stats(self):
+        """pre-computed f0 statistics path (None when the key is absent)"""
+        return self.config.get("f0_stats", None)
+
+    @property
+    def f0_vq_type(self):
+        """naive or precomp"""
+        return self.config["f0_vq_type"]
+
+    @property
+    def f0_vq_name(self):
+        return self.config["f0_vq_name"]
+
+    def get_f0_vq_naive_quantizer(self, log, norm_mean, norm_std):
+        """Return the naive quantizer entry for this scale / normalization combo."""
+        scale = "log" if log else "linear"
+        if not norm_mean:
+            # without mean normalization the key is "_none_norm", whatever norm_std is
+            norm = "_none_norm"
+        else:
+            norm = "_mean_std_norm" if norm_std else "_mean_norm"
+        return self.config["f0_vq_naive_quantizer"][scale + norm]
+
+    @property
+    def f0_vq_n_units(self):
+        return self.config["f0_vq_n_units"]
+
+    @property
+    def multispkr(self):
+        """how to parse speaker label from audio path (None when the key is absent)"""
+        return self.config.get("multispkr", None)
+
+
+def get_f0(audio, rate=16000):
+    """Return the pYAAPT pitch track (``samp_values``) of a 1-D waveform sampled at ``rate``."""
+    try:
+        import amfm_decompy.basic_tools as basic
+        import amfm_decompy.pYAAPT as pYAAPT
+        from librosa.util import normalize
+    except ImportError as err:
+        raise ImportError("Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`).") from err
+
+    assert audio.ndim == 1
+    frame_length = 20.0  # ms
+    to_pad = int(frame_length / 1000 * rate) // 2  # half a frame, in samples
+
+    audio = normalize(audio) * 0.95
+    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
+    audio = basic.SignalObj(audio, rate)
+    pitch = pYAAPT.yaapt(
+        audio,
+        frame_length=frame_length,
+        frame_space=F0_FRAME_SPACE * 1000,
+        nccf_thresh1=0.25,
+        tda_frame_length=25.0,
+    )
+    return pitch.samp_values
+
+
+def interpolate_f0(f0):
+    """Linearly interpolate the zero entries of a 1-D f0 tensor that lie between non-zero ones."""
+    try:
+        from scipy.interpolate import interp1d
+    except ImportError as err:
+        raise ImportError("Please install scipy (`pip install scipy`)") from err
+
+    orig_t = np.arange(f0.shape[0])
+    ii = f0 != 0
+    if ii.sum() <= 1:  # fewer than two non-zero points: nothing to interpolate between
+        return f0[:]
+    f0_interp = interp1d(
+        orig_t[ii], f0[ii], bounds_error=False, kind="linear", fill_value=0
+    )(orig_t)
+    return torch.Tensor(f0_interp).type_as(f0).to(f0.device)
+
+
+def naive_quantize(x, edges):
+    # bin index of each value = number of edges it strictly exceeds
+    return (x.view(-1, 1) > edges.view(1, -1)).long().sum(dim=1)
+
+
+def load_wav(full_path):
+    """Read an audio file with soundfile and return ``(data, sampling_rate)``."""
+    try:
+        import soundfile as sf
+    except ImportError as err:
+        raise ImportError("Please install soundfile (`pip install SoundFile`)") from err
+    return sf.read(full_path)
+
+
+def parse_code(code_str, dictionary, append_eos):
+    """Collapse consecutive repeated units into (encoded units, run lengths), both int16."""
+    units = torch.ShortTensor([int(tok) for tok in code_str.split()])
+    deduped, counts = torch.unique_consecutive(units, return_counts=True)
+    unit_line = " ".join(str(u) for u in deduped.tolist())
+    code = dictionary.encode_line(unit_line, append_eos).short()
+
+    if append_eos:
+        # extra zero duration entry for eos
+        counts = torch.cat((counts, counts.new_zeros((1,))), dim=0)
+    return code, counts.short()
+
+
+def parse_manifest(manifest, dictionary):
+    """Parse one dict literal per manifest line into (audio_files, codes, durations, speakers)."""
+    audio_files, codes, durations, speakers = [], [], [], []
+
+    with open(manifest) as info:
+        for line in info:  # stream the file rather than materialising readlines()
+            # SECURITY(review): eval() executes whatever expression the line holds, so
+            # only trusted manifests may be loaded here.
+            sample = eval(line.strip())
+            # the unit string is stored under one of the known feature keys
+            if "cpc_km100" in sample:
+                k = "cpc_km100"
+            elif "hubert_km100" in sample:
+                k = "hubert_km100"
+            elif "phone" in sample:
+                k = "phone"
+            else:
+                assert False, "unknown format"
+            code, duration = parse_code(sample[k], dictionary, append_eos=True)
+
+            codes.append(code)
+            durations.append(duration)
+            audio_files.append(sample["audio"])
+            speakers.append(sample.get("speaker", None))
+
+    return audio_files, codes, durations, speakers
+
+
+def parse_speaker(path, method):
+    """Derive a speaker label from an audio path using a named ``method`` or a callable."""
+    if isinstance(path, str):  # isinstance also converts str subclasses
+        path = Path(path)
+
+    if method == "parent_name":
+        return path.parent.name
+    if method == "parent_parent_name":
+        return path.parent.parent.name
+    if method == "_":
+        return path.name.split("_")[0]
+    if method == "single":
+        return "A"
+    if callable(method):
+        return method(path)
+    raise NotImplementedError()
+
+
+def get_f0_by_filename(filename, tgt_sampling_rate):
+    """Load ``filename`` and return its f0 track as a float32 tensor."""
+    wav, wav_sr = load_wav(filename)
+    if wav_sr != tgt_sampling_rate:
+        raise ValueError(
+            "{} SR doesn't match target {} SR".format(wav_sr, tgt_sampling_rate)
+        )
+
+    # compute un-interpolated f0, and use Ann's interp in __getitem__ if set
+    pitch = get_f0(wav, rate=tgt_sampling_rate)
+    return torch.from_numpy(pitch.astype(np.float32))
+
+
+def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1):  # returns one f0 value per entry of `durations`
+    code_len = durations.sum()
+    targ_len = int(f0_code_ratio * code_len)  # number of f0 frames the durations account for
+    diff = f0.size(0) - targ_len
+    assert abs(diff) <= tol, (
+        f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|"
+        f" > {tol} (dur=\n{durations})"
+    )
+    if diff > 0:
+        f0 = f0[:targ_len]  # drop the surplus trailing frames
+    elif diff < 0:
+        f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0)  # extend by repeating the last frame
+    # walk the segments with a fractional offset; each one spans dur * f0_code_ratio frames
+    f0_offset = 0.0
+    seg_f0s = []
+    for dur in durations:
+        f0_dur = dur.item() * f0_code_ratio
+        seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)]
+        seg_f0 = seg_f0[seg_f0 != 0]  # zero frames are excluded from the mean
+        if len(seg_f0) == 0:
+            seg_f0 = torch.tensor(0).type(seg_f0.type())  # no non-zero frame in this segment: emit 0
+        else:
+            seg_f0 = seg_f0.mean()
+        seg_f0s.append(seg_f0)
+        f0_offset += f0_dur
+    # sanity check: the accumulated offset must land exactly on the end of f0
+    assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}"
+    return torch.tensor(seg_f0s)
+
+
+class Paddings(object):  # holds the pad value for each of the code, duration and f0 streams
+    def __init__(self, code_val, dur_val=0, f0_val=-2.0):
+        self.code = code_val  # pad value for codes (no default; the caller must supply it)
+        self.dur = dur_val  # pad value for durations
+        self.f0 = f0_val  # pad value for f0
+
+
+class Shifts(object):
+    """Pad code, duration and f0 to a common length, offsetting dur and f0 by their shifts."""
+
+    def __init__(self, shifts_str, pads):
+        self._shifts = [int(s) for s in shifts_str.split(",")]
+        assert len(self._shifts) == 2, self._shifts
+        assert all(s >= 0 for s in self._shifts)
+        self.extra_length = max(self._shifts)
+        self.pads = pads
+
+    @property
+    def dur(self):
+        return self._shifts[0]
+
+    @property
+    def f0(self):
+        return self._shifts[1]
+
+    @staticmethod
+    def shift_one(seq, left_pad_num, right_pad_num, pad):
+        assert seq.ndim == 1
+        prefix = seq.new_full((left_pad_num,), pad)
+        suffix = seq.new_full((right_pad_num,), pad)
+        padded = torch.cat([prefix, seq, suffix])
+        mask = torch.ones_like(padded).bool()  # True marks the padded positions
+        mask[left_pad_num : len(padded) - right_pad_num] = 0
+        return padded, mask
+
+    def __call__(self, code, dur, f0):
+        total = self.extra_length
+        if total == 0:
+            streams = (code, dur, f0)  # no shift requested: inputs pass through with all-False masks
+            masks = [torch.zeros_like(t).bool() for t in streams]
+            return code, masks[0], dur, masks[1], f0, masks[2]
+
+        pads = self.pads
+        # code is only padded on the right; dur and f0 get their shift on the left
+        code, code_mask = self.shift_one(code, 0, total, pads.code)
+        dur, dur_mask = self.shift_one(dur, self.dur, total - self.dur, pads.dur)
+        f0, f0_mask = self.shift_one(f0, self.f0, total - self.f0, pads.f0)
+        return code, code_mask, dur, dur_mask, f0, f0_mask
+
+
+class CodeDataset(FairseqDataset):  # serves shifted code / duration / f0 streams for one manifest
+    def __init__(
+        self,
+        manifest,
+        dictionary,
+        dur_dictionary,
+        f0_dictionary,
+        config,
+        discrete_dur,
+        discrete_f0,
+        log_f0,
+        normalize_f0_mean,
+        normalize_f0_std,
+        interpolate_f0,
+        return_filename=False,
+        strip_filename=True,
+        shifts="0,0",
+        return_continuous_f0=False,
+    ):
+        random.seed(1234)  # NOTE(review): reseeds the process-wide `random` module on every construction
+        self.dictionary = dictionary
+        self.dur_dictionary = dur_dictionary
+        self.f0_dictionary = f0_dictionary
+        self.config = config
+
+        # duration config
+        self.discrete_dur = discrete_dur
+
+        # pitch config
+        self.discrete_f0 = discrete_f0
+        self.log_f0 = log_f0
+        self.normalize_f0_mean = normalize_f0_mean
+        self.normalize_f0_std = normalize_f0_std
+        self.interpolate_f0 = interpolate_f0
+
+        self.return_filename = return_filename
+        self.strip_filename = strip_filename
+        self.f0_code_ratio = config.code_hop_size / (
+            config.sampling_rate * F0_FRAME_SPACE
+        )
+
+        # use lazy loading to avoid sharing file handlers across workers
+        self.manifest = manifest
+        self._codes = None
+        self._durs = None
+        self._f0s = None
+        with open(f"{manifest}.leng.txt", "r") as f:
+            lengs = [int(line.rstrip()) for line in f]
+            edges = np.cumsum([0] + lengs)  # entry i occupies rows edges[i]:edges[i + 1] of the flat arrays
+            self.starts, self.ends = edges[:-1], edges[1:]
+        with open(f"{manifest}.path.txt", "r") as f:
+            self.file_names = [line.rstrip() for line in f]
+        logger.info(f"num entries: {len(self.starts)}")
+
+        if os.path.exists(f"{manifest}.f0_stat.pt"):
+            self.f0_stats = torch.load(f"{manifest}.f0_stat.pt")
+        elif config.f0_stats:
+            self.f0_stats = torch.load(config.f0_stats)
+        # NOTE(review): self.f0_stats stays unset when neither source exists, yet __getitem__ reads it
+        self.multispkr = config.multispkr
+        if config.multispkr:
+            with open(f"{manifest}.speaker.txt", "r") as f:
+                self.spkrs = [line.rstrip() for line in f]
+            self.id_to_spkr = sorted(self.spkrs)  # NOTE(review): one entry per line, so repeated speakers stay duplicated
+            self.spkr_to_id = {k: v for v, k in enumerate(self.id_to_spkr)}
+
+        self.pads = Paddings(
+            dictionary.pad(),
+            0,  # use 0 for duration padding
+            f0_dictionary.pad() if discrete_f0 else -5.0,
+        )
+        self.shifts = Shifts(shifts, pads=self.pads)
+        self.return_continuous_f0 = return_continuous_f0
+
+    def get_data_handlers(self):  # opens the .npy arrays memory-mapped; called lazily from _get_raw_item
+        logger.info(f"loading data for {self.manifest}")  # module logger, consistent with __init__
+        self._codes = np.load(f"{self.manifest}.code.npy", mmap_mode="r")
+        self._durs = np.load(f"{self.manifest}.dur.npy", mmap_mode="r")
+
+        if self.discrete_f0:
+            if self.config.f0_vq_type == "precomp":
+                self._f0s = np.load(
+                    f"{self.manifest}.{self.config.f0_vq_name}.npy", mmap_mode="r"
+                )
+            elif self.config.f0_vq_type == "naive":
+                self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
+                quantizers_path = self.config.get_f0_vq_naive_quantizer(
+                    self.log_f0, self.normalize_f0_mean, self.normalize_f0_std
+                )
+                quantizers = torch.load(quantizers_path)
+                n_units = self.config.f0_vq_n_units
+                self._f0_quantizer = torch.from_numpy(quantizers[n_units])
+            else:
+                raise ValueError(f"f0_vq_type {self.config.f0_vq_type} not supported")
+        else:
+            self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
+
+    def preprocess_f0(self, f0, stats):
+        """
+        1. interpolate
+        2. log transform (keep unvoiced frame 0)
+        """
+        # TODO: change this to be dependent on config for naive quantizer
+        f0 = f0.clone()  # work on a copy so the caller's tensor is not modified
+        if self.interpolate_f0:
+            f0 = interpolate_f0(f0)  # the module-level function; self.interpolate_f0 is only the flag
+
+        mask = f0 != 0  # only process voiced frames
+        if self.log_f0:
+            f0[mask] = f0[mask].log()
+        if self.normalize_f0_mean:
+            mean = stats["logf0_mean"] if self.log_f0 else stats["f0_mean"]
+            f0[mask] = f0[mask] - mean
+        if self.normalize_f0_std:
+            std = stats["logf0_std"] if self.log_f0 else stats["f0_std"]
+            f0[mask] = f0[mask] / std
+        return f0
+
+    def _get_raw_item(self, index):
+        start, end = self.starts[index], self.ends[index]
+        if self._codes is None:  # arrays are opened on first access, see get_data_handlers
+            self.get_data_handlers()
+        code = torch.from_numpy(np.array(self._codes[start:end])).long()
+        dur = torch.from_numpy(np.array(self._durs[start:end]))
+        f0 = torch.from_numpy(np.array(self._f0s[start:end]))
+        return code, dur, f0
+
+    def __getitem__(self, index):
+        code, dur, f0 = self._get_raw_item(index)
+        code = torch.cat([code.new([self.dictionary.bos()]), code])
+
+        # use 0 for eos and bos
+        dur = torch.cat([dur.new([0]), dur])
+        if self.discrete_dur:
+            dur = self.dur_dictionary.encode_line(
+                " ".join(map(str, dur.tolist())), append_eos=False
+            ).long()
+        else:
+            dur = dur.float()
+
+        # TODO: find a more elegant approach
+        raw_f0 = None
+        if self.discrete_f0:
+            if self.config.f0_vq_type == "precomp":
+                f0 = self.f0_dictionary.encode_line(
+                    " ".join(map(str, f0.tolist())), append_eos=False
+                ).long()
+            else:
+                f0 = f0.float()
+                f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])  # NOTE(review): self.spkrs only exists when config.multispkr is set — confirm this path is multi-speaker only
+                if self.return_continuous_f0:
+                    raw_f0 = f0
+                    raw_f0 = torch.cat([raw_f0.new([self.f0_dictionary.bos()]), raw_f0])  # the leading slot is dropped again by raw_f0[1:] below
+                f0 = naive_quantize(f0, self._f0_quantizer)
+            f0 = torch.cat([f0.new([self.f0_dictionary.bos()]), f0])
+        else:
+            f0 = f0.float()
+            if self.multispkr:
+                f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]])
+            else:
+                f0 = self.preprocess_f0(f0, self.f0_stats)
+            f0 = torch.cat([f0.new([0]), f0])
+
+        if raw_f0 is not None:
+            *_, raw_f0, raw_f0_mask = self.shifts(code, dur, raw_f0)  # only the f0 outputs of this call are kept
+        else:
+            raw_f0_mask = None
+
+        code, code_mask, dur, dur_mask, f0, f0_mask = self.shifts(code, dur, f0)
+        if raw_f0_mask is not None:
+            assert (raw_f0_mask == f0_mask).all()
+
+        # is a padded frame if either input or output is padded
+        feats = {
+            "source": code[:-1],
+            "target": code[1:],
+            "mask": code_mask[1:].logical_or(code_mask[:-1]),
+            "dur_source": dur[:-1],
+            "dur_target": dur[1:],
+            "dur_mask": dur_mask[1:].logical_or(dur_mask[:-1]),
+            "f0_source": f0[:-1],
+            "f0_target": f0[1:],
+            "f0_mask": f0_mask[1:].logical_or(f0_mask[:-1]),
+        }
+
+        if raw_f0 is not None:
+            feats["raw_f0"] = raw_f0[1:]
+
+        if self.return_filename:
+            fname = self.file_names[index]
+            feats["filename"] = (
+                fname if not self.strip_filename else Path(fname).with_suffix("").name
+            )
+        return feats
+
+    def __len__(self):
+        return len(self.starts)
+
+    def size(self, index):
+        return self.ends[index] - self.starts[index] + self.shifts.extra_length  # stored length plus the shift padding
+
+    def num_tokens(self, index):
+        return self.size(index)
+
+    def collater(self, samples):
+        pad_idx, eos_idx = self.dictionary.pad(), self.dictionary.eos()
+        if len(samples) == 0:
+            return {}
+
+        src_tokens = data_utils.collate_tokens(
+            [s["source"] for s in samples], pad_idx, eos_idx, left_pad=False
+        )
+
+        tgt_tokens = data_utils.collate_tokens(
+            [s["target"] for s in samples],
+            pad_idx=pad_idx,
+            eos_idx=pad_idx,  # appending padding, eos is there already
+            left_pad=False,
+        )
+
+        src_durs, tgt_durs = [
+            data_utils.collate_tokens(
+                [s[k] for s in samples],
+                pad_idx=self.pads.dur,
+                eos_idx=self.pads.dur,
+                left_pad=False,
+            )
+            for k in ["dur_source", "dur_target"]
+        ]
+
+        src_f0s, tgt_f0s = [
+            data_utils.collate_tokens(
+                [s[k] for s in samples],
+                pad_idx=self.pads.f0,
+                eos_idx=self.pads.f0,
+                left_pad=False,
+            )
+            for k in ["f0_source", "f0_target"]
+        ]
+
+        mask, dur_mask, f0_mask = [
+            data_utils.collate_tokens(
+                [s[k] for s in samples],
+                pad_idx=1,  # batch padding is filled with 1, i.e. marked as padded
+                eos_idx=1,
+                left_pad=False,
+            )
+            for k in ["mask", "dur_mask", "f0_mask"]
+        ]
+
+        src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
+        n_tokens = sum(len(s["source"]) for s in samples)
+
+        result = {
+            "nsentences": len(samples),
+            "ntokens": n_tokens,
+            "net_input": {
+                "src_tokens": src_tokens,
+                "src_lengths": src_lengths,
+                "dur_src": src_durs,
+                "f0_src": src_f0s,
+            },
+            "target": tgt_tokens,
+            "dur_target": tgt_durs,
+            "f0_target": tgt_f0s,
+            "mask": mask,
+            "dur_mask": dur_mask,
+            "f0_mask": f0_mask,
+        }
+
+        if "filename" in samples[0]:
+            result["filename"] = [s["filename"] for s in samples]
+
+        # TODO: remove this hack into the inference dataset
+        if "prefix" in samples[0]:
+            result["prefix"] = [s["prefix"] for s in samples]
+
+        if "raw_f0" in samples[0]:
+            raw_f0s = data_utils.collate_tokens(
+                [s["raw_f0"] for s in samples],
+                pad_idx=self.pads.f0,
+                eos_idx=self.pads.f0,
+                left_pad=False,
+            )
+            result["raw_f0"] = raw_f0s
+        return result
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
new file mode 100644
index 0000000000..7a6d271379
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset
+
+
+class ColorizeDataset(BaseWrapperDataset):
+    """Adds 'colors' property to net input that is obtained from the provided color getter for use by models"""
+
+    def __init__(self, dataset, color_getter):
+        super().__init__(dataset)
+        self.color_getter = color_getter
+
+    def collater(self, samples):
+        batch = super().collater(samples)
+        if len(batch) == 0:
+            # an empty collate result is returned untouched
+            return batch
+        colors = [self.color_getter(self.dataset, sample["id"]) for sample in samples]
+        batch["net_input"]["colors"] = torch.tensor(colors, dtype=torch.long)
+        return batch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
new file mode 100644
index 0000000000..01a4078bb1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
@@ -0,0 +1,124 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bisect
+
+import numpy as np
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+class ConcatDataset(FairseqDataset):  # concatenates datasets, each scaled by a sample ratio
+    @staticmethod
+    def cumsum(sequence, sample_ratios):
+        r, s = [], 0
+        for e, ratio in zip(sequence, sample_ratios):
+            curr_len = int(ratio * len(e))  # length this dataset contributes after scaling
+            r.append(curr_len + s)
+            s += curr_len
+        return r
+
+    def __init__(self, datasets, sample_ratios=1):
+        super(ConcatDataset, self).__init__()
+        assert len(datasets) > 0, "datasets should not be an empty iterable"
+        self.datasets = list(datasets)
+        if isinstance(sample_ratios, int):
+            sample_ratios = [sample_ratios] * len(self.datasets)  # one shared ratio for every dataset
+        self.sample_ratios = sample_ratios
+        self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios)
+        self.real_sizes = [len(d) for d in self.datasets]
+
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+
+    def __getitem__(self, idx):
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return self.datasets[dataset_idx][sample_idx]
+
+    def _get_dataset_and_sample_index(self, idx: int):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        sample_idx = sample_idx % self.real_sizes[dataset_idx]  # wrap around when the scaled length exceeds the real one
+        return dataset_idx, sample_idx
+
+    def collater(self, samples, **extra_args):
+        # For now only supports datasets with same underlying collater implementations
+        if hasattr(self.datasets[0], "collater"):
+            return self.datasets[0].collater(samples, **extra_args)
+        else:
+            return default_collate(samples, **extra_args)
+
+    def size(self, idx: int):
+        """
+        Return an example's size as a float or tuple.
+        """
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return self.datasets[dataset_idx].size(sample_idx)
+
+    def num_tokens(self, index: int):
+        return np.max(self.size(index))
+
+    def attr(self, attr: str, index: int):
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, index)
+        return getattr(self.datasets[dataset_idx], attr, None)
+
+    @property
+    def sizes(self):
+        _dataset_sizes = []
+        for ds, sr in zip(self.datasets, self.sample_ratios):
+            if isinstance(ds.sizes, np.ndarray):
+                _dataset_sizes.append(np.tile(ds.sizes, sr))
+            else:
+                # Only support underlying dataset with single size array.
+                assert isinstance(ds.sizes, list)
+                _dataset_sizes.append(np.tile(ds.sizes[0], sr))
+        return np.concatenate(_dataset_sizes)
+
+    @property
+    def supports_prefetch(self):
+        return all(d.supports_prefetch for d in self.datasets)
+
+    def ordered_indices(self):
+        """
+        Returns indices sorted by length. So less padding is needed.
+        """
+        # `sizes` is a property that re-tiles and re-concatenates every
+        # underlying size array on each access, so read it exactly once.
+        sizes = self.sizes
+        if isinstance(sizes, np.ndarray) and len(sizes.shape) > 1:
+            # special handling for concatenating lang_pair_datasets
+            indices = np.arange(len(self))
+            has_tgt = sizes.shape[1] > 1
+            tgt_sizes = sizes[:, 1] if has_tgt else None
+            src_sizes = sizes[:, 0] if has_tgt else sizes
+            # sort by target length, then source length
+            if tgt_sizes is not None:
+                indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")]
+            # mergesort is stable, so equal source lengths keep their target order
+            return indices[np.argsort(src_sizes[indices], kind="mergesort")]
+        else:
+            return np.argsort(sizes)
+
+    def prefetch(self, indices):
+        frm = 0
+        for to, ds in zip(self.cumulative_sizes, self.datasets):
+            real_size = len(ds)
+            if getattr(ds, "supports_prefetch", False):
+                ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to])  # translate global indices in [frm, to) to local ones
+            frm = to
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return all(d.can_reuse_epoch_itr_across_epochs for d in self.datasets)
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        for ds in self.datasets:
+            if hasattr(ds, "set_epoch"):
+                ds.set_epoch(epoch)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
new file mode 100644
index 0000000000..625a29370e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class ConcatSentencesDataset(FairseqDataset):  # item i is the concatenation of item i from every wrapped dataset
+    def __init__(self, *datasets):
+        super().__init__()
+        self.datasets = datasets
+        assert all(
+            len(ds) == len(datasets[0]) for ds in datasets
+        ), "datasets must have the same length"
+
+    def __getitem__(self, index):
+        return torch.cat([ds[index] for ds in self.datasets])  # joined in the order the datasets were passed
+
+    def __len__(self):
+        return len(self.datasets[0])
+
+    def collater(self, samples):
+        return self.datasets[0].collater(samples)  # collation is delegated to the first dataset only
+
+    @property
+    def sizes(self):
+        return sum(ds.sizes for ds in self.datasets)  # sum of the datasets' `sizes` (NOTE(review): presumably element-wise on arrays)
+
+    def num_tokens(self, index):
+        return sum(ds.num_tokens(index) for ds in self.datasets)
+
+    def size(self, index):
+        return sum(ds.size(index) for ds in self.datasets)
+
+    def ordered_indices(self):
+        return self.datasets[0].ordered_indices()  # ordering follows the first dataset only
+
+    @property
+    def supports_prefetch(self):
+        return any(getattr(ds, "supports_prefetch", False) for ds in self.datasets)
+
+    def prefetch(self, indices):
+        for ds in self.datasets:
+            if getattr(ds, "supports_prefetch", False):  # datasets without prefetch support are skipped
+                ds.prefetch(indices)
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        for ds in self.datasets:
+            if hasattr(ds, "set_epoch"):
+                ds.set_epoch(epoch)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
new file mode 100644
index 0000000000..9a19cc3c18
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
@@ -0,0 +1,1144 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
+import contextlib
+import itertools
+import logging
+import re
+import warnings
+from typing import Optional, Tuple
+
+import math
+import numpy as np
+import torch
+
+from fairseq.file_io import PathManager
+from fairseq import utils
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def infer_language_pair(path):
+    """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx
+
+    Scans the entries returned by ``PathManager.ls(path)`` and returns the
+    two dash-separated parts of the second dot-separated field of the first
+    matching name (as a list). Returns ``(None, None)`` if nothing matches.
+    """
+    for entry in PathManager.ls(path):
+        fields = entry.split(".")
+        if len(fields) < 3:
+            continue
+        langs = fields[1].split("-")
+        if len(langs) == 2:
+            return langs
+    return None, None
+
+
+def collate_tokens(
+    values,
+    pad_idx,
+    eos_idx=None,
+    left_pad=False,
+    move_eos_to_beginning=False,
+    pad_to_length=None,
+    pad_to_multiple=1,
+    pad_to_bsz=None,
+):
+    """Convert a list of 1d tensors into a padded 2d tensor.
+
+    Args:
+        values: sequence of 1d tensors.
+        pad_idx: fill value for the padded positions.
+        eos_idx: value written at position 0 when *move_eos_to_beginning* is
+            set; if ``None`` the last element of each tensor is used instead.
+        left_pad: pad on the left instead of on the right.
+        move_eos_to_beginning: write the EOS value first and shift the
+            remaining elements (all but the last) right by one.
+        pad_to_length: minimum width of the result.
+        pad_to_multiple: round the width up to a multiple of this value.
+        pad_to_bsz: minimum number of rows of the result.
+    """
+    width = max(v.size(0) for v in values)
+    if pad_to_length is not None:
+        width = max(width, pad_to_length)
+    if pad_to_multiple != 1 and width % pad_to_multiple != 0:
+        # round up to the next multiple of pad_to_multiple
+        width = int(((width - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
+
+    rows = len(values)
+    if pad_to_bsz is not None:
+        rows = max(rows, pad_to_bsz)
+    out = values[0].new(rows, width).fill_(pad_idx)
+
+    for row, seq in enumerate(values):
+        n = len(seq)
+        target = out[row][width - n :] if left_pad else out[row][:n]
+        assert target.numel() == seq.numel()
+        if not move_eos_to_beginning:
+            target.copy_(seq)
+            continue
+        # if no eos_idx is specified, then use the last token in seq
+        target[0] = seq[-1] if eos_idx is None else eos_idx
+        target[1:] = seq[:-1]
+    return out
+
+
+def load_indexed_dataset(
+    path, dictionary=None, dataset_impl=None, combine=False, default="cached"
+):
+    """A helper function for loading indexed datasets.
+
+    Args:
+        path (str): path to indexed dataset (e.g., 'data-bin/train')
+        dictionary (~fairseq.data.Dictionary): data dictionary
+        dataset_impl (str, optional): which dataset implementation to use. If
+            not provided, it will be inferred automatically. For legacy indexed
+            data we use the 'cached' implementation by default.
+        combine (bool, optional): automatically load and combine multiple
+            datasets. For example, if *path* is 'data-bin/train', then we will
+            combine 'data-bin/train', 'data-bin/train1', ... and return a
+            single ConcatDataset instance.
+        default (str, optional): implementation used when *dataset_impl* is
+            not given and none could be inferred (default: 'cached').
+
+    Returns:
+        ``None`` if no dataset could be loaded, the dataset itself if exactly
+        one was loaded, otherwise a ``ConcatDataset`` over all loaded shards.
+    """
+    import fairseq.data.indexed_dataset as indexed_dataset
+    from fairseq.data.concat_dataset import ConcatDataset
+
+    datasets = []
+    for k in itertools.count():
+        # shard 0 has no numeric suffix, later shards are path1, path2, ...
+        path_k = path + (str(k) if k > 0 else "")
+        try:
+            path_k = indexed_dataset.get_indexed_dataset_to_local(path_k)
+        except Exception as e:
+            # A missing remote path is tolerated (make_dataset below decides
+            # whether the shard exists); every other failure propagates.
+            if "StorageException: [404] Path not found" not in str(e):
+                raise
+            logger.warning(f"path_k: {path_k} not found ({e})")
+
+        dataset_impl_k = dataset_impl
+        if dataset_impl_k is None:
+            dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k)
+        dataset = indexed_dataset.make_dataset(
+            path_k,
+            impl=dataset_impl_k or default,
+            fix_lua_indexing=True,
+            dictionary=dictionary,
+        )
+        if dataset is None:
+            break
+        logger.info("loaded {:,} examples from: {}".format(len(dataset), path_k))
+        datasets.append(dataset)
+        if not combine:
+            break
+    if len(datasets) == 0:
+        return None
+    elif len(datasets) == 1:
+        return datasets[0]
+    else:
+        return ConcatDataset(datasets)
+
+
+@contextlib.contextmanager
+def numpy_seed(seed, *addl_seeds):
+    """Context manager which seeds the NumPy PRNG with the specified seed and
+    restores the state afterward.
+
+    With ``seed=None`` the global PRNG is left untouched. When additional
+    seeds are given, the effective seed is derived from the hash of all of
+    them.
+    """
+    if seed is None:
+        # Nothing to seed: run the body against the ambient PRNG state.
+        yield
+    else:
+        if addl_seeds:
+            seed = int(hash((seed, *addl_seeds)) % 1e6)
+        saved_state = np.random.get_state()
+        np.random.seed(seed)
+        try:
+            yield
+        finally:
+            np.random.set_state(saved_state)
+
+
+def collect_filtered(function, iterable, filtered):
+    """
+    Lazy variant of :func:`filter` that also records the rejected elements.
+
+    Args:
+        function (callable): predicate; elements for which it returns a
+            falsy value are rejected
+        iterable (iterable): elements to examine
+        filtered (list): rejected elements are appended here, in order
+
+    Yields:
+        the elements for which ``function`` returned a truthy value
+    """
+    for item in iterable:
+        if not function(item):
+            filtered.append(item)
+            continue
+        yield item
+
+
+def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False):
+    """Filter *indices* by calling ``size_fn`` on every index.
+
+    ``max_positions`` may be a number, a dict of per-key limits, or a
+    sequence of limits; ``None`` entries on either side are treated as
+    "no limit". ``raise_exception`` is accepted but not used here.
+
+    Returns:
+        tuple of (np.int64 array of kept indices, list of rejected indices)
+    """
+
+    def within(sizes, limits):
+        # component-wise comparison; a None on either side always passes
+        return all(a is None or b is None or a <= b for a, b in zip(sizes, limits))
+
+    def check_size(idx):
+        if isinstance(max_positions, (float, int)):
+            return size_fn(idx) <= max_positions
+        if isinstance(max_positions, dict):
+            idx_size = size_fn(idx)
+            assert isinstance(idx_size, dict)
+            shared_keys = set(max_positions.keys()) & set(idx_size.keys())
+            return all(
+                within(idx_size[key], max_positions[key]) for key in shared_keys
+            )
+        # For MultiCorpusSampledDataset, will generalize it later
+        if not isinstance(size_fn(idx), Iterable):
+            return all(size_fn(idx) <= b for b in max_positions)
+        return within(size_fn(idx), max_positions)
+
+    ignored = []
+    kept = collect_filtered(check_size, indices, ignored)
+    indices = np.fromiter(kept, dtype=np.int64, count=-1)
+    return indices, ignored
+
+
+def filter_by_size(indices, dataset, max_positions, raise_exception=False):
+    """
+    [deprecated] Filter indices based on their size.
+    Use `FairseqDataset::filter_indices_by_size` instead.
+
+    Args:
+        indices (List[int]): ordered list of dataset indices
+        dataset (FairseqDataset): fairseq dataset instance
+        max_positions (tuple): filter elements larger than this size.
+            Comparisons are done component-wise.
+        raise_exception (bool, optional): if ``True``, raise an exception if
+            any elements are filtered (default: False).
+
+    Returns:
+        the indices whose size does not exceed *max_positions*
+    """
+    warnings.warn(
+        "data_utils.filter_by_size is deprecated. "
+        "Use `FairseqDataset::filter_indices_by_size` instead.",
+        stacklevel=2,
+    )
+
+    # Fast path: a scalar limit plus a precomputed size vector (either an
+    # ndarray or a one-element list wrapping one) allows vectorised filtering.
+    size_vector = None
+    vectorised = False
+    if isinstance(max_positions, (float, int)) and hasattr(dataset, "sizes"):
+        if isinstance(dataset.sizes, np.ndarray):
+            size_vector, vectorised = dataset.sizes, True
+        elif isinstance(dataset.sizes, list) and len(dataset.sizes) == 1:
+            size_vector, vectorised = dataset.sizes[0], True
+
+    if vectorised:
+        selected_sizes = size_vector[indices]
+        ignored = indices[selected_sizes > max_positions].tolist()
+        indices = indices[selected_sizes <= max_positions]
+    else:
+        indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions)
+
+    if len(ignored) > 0:
+        if raise_exception:
+            raise Exception(
+                (
+                    "Size of sample #{} is invalid (={}) since max_positions={}, "
+                    "skip this example with --skip-invalid-size-inputs-valid-test"
+                ).format(ignored[0], dataset.size(ignored[0]), max_positions)
+            )
+        logger.warning(
+            (
+                "{} samples have invalid sizes and will be skipped, "
+                "max_positions={}, first few sample ids={}"
+            ).format(len(ignored), max_positions, ignored[:10])
+        )
+    return indices
+
+
+def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_sizes):
+    """Filter a list of sample indices. Remove those that are longer
+        than specified in max_sizes.
+
+    Args:
+        src_sizes (np.array): source-side size of every sample
+        tgt_sizes (np.array or None): target-side size of every sample;
+            ``None`` disables the target-side check
+        indices (np.array): original array of sample indices
+        max_sizes (int or list[int] or tuple[int]): max sample size,
+            can be defined separately for src and tgt (then list or tuple)
+
+    Returns:
+        np.array: filtered sample array
+        list: list of removed indices
+    """
+    if max_sizes is None:
+        return indices, []
+
+    if type(max_sizes) in (int, float):
+        max_src_size = max_tgt_size = max_sizes
+    else:
+        max_src_size, max_tgt_size = max_sizes
+
+    src_len = src_sizes[indices]
+    if tgt_sizes is None:
+        too_long = src_len > max_src_size
+        fits = src_len <= max_src_size
+    else:
+        tgt_len = tgt_sizes[indices]
+        too_long = (src_len > max_src_size) | (tgt_len > max_tgt_size)
+        fits = (src_len <= max_src_size) & (tgt_len <= max_tgt_size)
+
+    ignored = indices[too_long]
+    if len(ignored) > 0:
+        # only re-index when something is actually dropped
+        indices = indices[fits]
+    return indices, ignored.tolist()
+
+
+def batch_by_size(
+    indices,
+    num_tokens_fn,
+    num_tokens_vec=None,
+    max_tokens=None,
+    max_sentences=None,
+    required_batch_size_multiple=1,
+    fixed_shapes=None,
+):
+    """
+    Yield mini-batches of indices bucketed by size. Batches may contain
+    sequences of different lengths.
+
+    Args:
+        indices (List[int]): ordered list of dataset indices
+        num_tokens_fn (callable): function that returns the number of tokens at
+            a given index
+        num_tokens_vec (List[int], optional): precomputed vector of the number
+            of tokens for each index in indices (to enable faster batch generation)
+        max_tokens (int, optional): max number of tokens in each batch
+            (default: None).
+        max_sentences (int, optional): max number of sentences in each
+            batch (default: None).
+        required_batch_size_multiple (int, optional): require batch size to
+            be less than N or a multiple of N (default: 1).
+        fixed_shapes (List[Tuple[int, int]], optional): if given, batches will
+            only be created with the given shapes. *max_sentences* and
+            *required_batch_size_multiple* will be ignored (default: None).
+
+    Returns:
+        the batches produced by the Cython batching routines. When
+        *required_batch_size_multiple* > 1, a trailing batch whose length is
+        not a multiple of it is dropped.
+
+    Raises:
+        ImportError, ValueError: if the Cython extension is missing or stale.
+    """
+    try:
+        from fairseq.data.data_utils_fast import (
+            batch_by_size_fn,
+            batch_by_size_vec,
+            batch_fixed_shapes_fast,
+        )
+    except ImportError:
+        raise ImportError(
+            "Please build Cython components with: "
+            "`python setup.py build_ext --inplace`"
+        )
+    except ValueError:
+        raise ValueError(
+            "Please build (or rebuild) Cython components with `python setup.py build_ext --inplace`."
+        )
+
+    # added int() to avoid TypeError: an integer is required
+    # -1 is the "no limit" value handed to the Cython routines
+    max_tokens = int(max_tokens) if max_tokens is not None else -1
+    max_sentences = max_sentences if max_sentences is not None else -1
+    bsz_mult = required_batch_size_multiple
+
+    if not isinstance(indices, np.ndarray):
+        indices = np.fromiter(indices, dtype=np.int64, count=-1)
+
+    if num_tokens_vec is not None and not isinstance(num_tokens_vec, np.ndarray):
+        num_tokens_vec = np.fromiter(num_tokens_vec, dtype=np.int64, count=-1)
+
+    if fixed_shapes is None:
+        if num_tokens_vec is None:
+            b = batch_by_size_fn(
+                indices,
+                num_tokens_fn,
+                max_tokens,
+                max_sentences,
+                bsz_mult,
+            )
+        else:
+            b = batch_by_size_vec(
+                indices,
+                num_tokens_vec,
+                max_tokens,
+                max_sentences,
+                bsz_mult,
+            )
+
+        # Drop an incomplete trailing batch. The emptiness check avoids an
+        # IndexError on b[-1] when no batch was produced at all.
+        if bsz_mult > 1 and len(b) > 0 and len(b[-1]) % bsz_mult != 0:
+            b = b[:-1]
+
+        return b
+
+    else:
+        fixed_shapes = np.array(fixed_shapes, dtype=np.int64)
+        sort_order = np.lexsort(
+            [
+                fixed_shapes[:, 1].argsort(),  # length
+                fixed_shapes[:, 0].argsort(),  # bsz
+            ]
+        )
+        fixed_shapes_sorted = fixed_shapes[sort_order]
+        return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted)
+
+
+def post_process(sentence: str, symbol: str):
+    """Undo a tokenisation / sub-word scheme on *sentence*.
+
+    Args:
+        sentence: the tokenised text
+        symbol: name of the scheme to undo ("sentencepiece", "wordpiece",
+            "letter", "silence", "_EOW", "subword_nmt", "@@ ", "@@"),
+            or "none" / ``None`` to return the sentence unchanged
+
+    Raises:
+        NotImplementedError: for any other *symbol*
+    """
+    # Schemes that drop every space and then turn a word-boundary marker
+    # back into a space.
+    boundary_markers = {
+        "sentencepiece": "\u2581",
+        "wordpiece": "_",
+        "letter": "|",
+        "_EOW": "_EOW",
+    }
+    if symbol in boundary_markers:
+        joined = sentence.replace(" ", "")
+        return joined.replace(boundary_markers[symbol], " ").strip()
+
+    if symbol == "silence":
+        import re
+
+        without_sil = sentence.replace("<SIL>", "")
+        return re.sub(" +", " ", without_sil).strip()
+
+    if symbol in {"subword_nmt", "@@ ", "@@"}:
+        marker = "@@ " if symbol == "subword_nmt" else symbol
+        # the appended space lets a trailing "@@" match the "@@ " marker
+        return (sentence + " ").replace(marker, "").rstrip()
+
+    if symbol == "none" or symbol is None:
+        return sentence
+
+    raise NotImplementedError(f"Unknown post_process option: {symbol}")
+
+
+def compute_mask_indices(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+    require_same_masks: bool = True,
+    mask_dropout: float = 0.0,
+    add_masks: bool = False,
+    seed: Optional[int] = None,
+    epoch: Optional[int] = None,
+    indices: Optional[torch.Tensor] = None,
+    idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
+    num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape
+
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_length: length of a mask span (the mean / upper-bound parameter for the non-static mask types)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask length
+        mask_other: secondary parameter whose meaning depends on mask_type (see above)
+        min_masks: minimum number of masked spans
+        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+        mask_dropout: randomly dropout this percentage of masks in each example
+        add_masks: only used if require_same_masks is True; equalise to the largest per-sample mask count
+            (adding random masks to the other samples) instead of to the smallest
+        seed, epoch, indices: when all three are given, sample i uses its own generator seeded from
+            hash((seed, epoch, indices[i])); otherwise the generators are seeded from OS entropy
+        idc_select_ver: 1 draws span starts from [0, sz - min span length), 2 draws them from [0, sz)
+        num_mask_ver: 1 rounds the number of spans with the global np.random state (and shares one value across
+            the batch when there is no padding mask), 2 rounds it with the per-sample generator
+
+    Returns:
+        boolean array of shape (batch size, timesteps); True marks a masked position
+
+    Raises:
+        ValueError: for an unknown idc_select_ver / num_mask_ver, when a static mask has no span,
+            or when a whole sequence would end up masked
+        Exception: for an unknown mask_type
+    """
+
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    if num_mask_ver == 1:
+        all_num_mask = int(
+            # add a random number for probabilistic rounding
+            mask_prob * all_sz / float(mask_length)
+            + np.random.rand()
+        )
+        all_num_mask = max(min_masks, all_num_mask)
+
+    mask_idcs = []
+    for i in range(bsz):
+        if seed is not None and epoch is not None and indices is not None:
+            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
+        else:
+            seed_i = None
+
+        rng = np.random.default_rng(seed_i)
+
+        if padding_mask is not None:
+            # number of non-padded timesteps of this sample
+            sz = all_sz - padding_mask[i].long().sum().item()
+            assert sz >= 0, sz
+        else:
+            sz = all_sz
+
+        if num_mask_ver == 1:
+            if padding_mask is not None:
+                num_mask = int(
+                    # add a random number for probabilistic rounding
+                    mask_prob * sz / float(mask_length)
+                    + np.random.rand()
+                )
+                num_mask = max(min_masks, num_mask)
+            else:
+                num_mask = all_num_mask
+        elif num_mask_ver == 2:
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length)
+                + rng.random()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            raise ValueError()
+
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            # np.random.Generator has no randint(); integers() is its
+            # equivalent (upper bound exclusive)
+            lengths = rng.integers(
+                int(mask_other), mask_length * 2 + 1, size=num_mask
+            )
+        elif mask_type == "normal":
+            lengths = rng.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = rng.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+
+        if sum(lengths) == 0:
+            if mask_type == "static":
+                raise ValueError("this should never happens")
+            else:
+                lengths = [min(mask_length, sz - 1)]
+
+        if no_overlap:
+            mask_idc = []
+
+            def arrange(s, e, length, keep_length):
+                # place one span inside [s, e) and return the leftover
+                # sub-ranges that can still host a span of keep_length
+                span_start = rng.integers(s, e - length)
+                mask_idc.extend(span_start + i for i in range(length))
+
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                # weight of each free range: its size if the span fits, else 0
+                # (np.int was removed in NumPy 1.24, hence np.int64)
+                lens = np.fromiter(
+                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                    np.int64,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / l_sum
+                c = rng.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            # explicit dtype keeps an empty result usable as an index array
+            mask_idc = np.asarray(mask_idc, dtype=np.int64)
+        else:
+            if idc_select_ver == 1:
+                min_len = min(lengths)
+                if sz - min_len <= num_mask:
+                    min_len = sz - num_mask - 1
+                mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
+            elif idc_select_ver == 2:
+                mask_idc = rng.choice(sz, num_mask, replace=False)
+            else:
+                raise ValueError()
+
+            # expand every span start into the indices the span covers
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ],
+                dtype=np.int64,
+            )
+
+        mask_idc = np.unique(mask_idc[mask_idc < sz])
+        if len(mask_idc) >= sz:
+            raise ValueError(
+                (
+                    f"the entire sequence is masked. "
+                    f"sz={sz}; mask_idc={mask_idc}; "
+                    f"index={indices[i] if indices is not None else None}"
+                )
+            )
+        mask_idcs.append(mask_idc)
+
+    target_len = None
+    if require_same_masks:
+        if add_masks:
+            target_len = max([len(m) for m in mask_idcs])
+        else:
+            target_len = min([len(m) for m in mask_idcs])
+
+    # NOTE: from here on `rng` is the generator of the last sample of the batch
+    for i, mask_idc in enumerate(mask_idcs):
+        if target_len is not None and len(mask_idc) > target_len:
+            mask_idc = rng.choice(mask_idc, target_len, replace=False)
+
+        mask[i, mask_idc] = True
+
+        if target_len is not None and len(mask_idc) < target_len:
+            unmasked = np.flatnonzero(~mask[i])
+            to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
+            mask[i, to_mask] = True
+
+        if mask_dropout > 0:
+            masked = np.flatnonzero(mask[i])
+            num_holes = np.rint(len(masked) * mask_dropout).astype(int)
+            to_drop = rng.choice(masked, num_holes, replace=False)
+            mask[i, to_drop] = False
+
+    return mask
+
+
+def compute_block_mask_2d(
+    shape: Tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    mask_prob_adjust: float = 0,
+    inverse_mask: bool = False,
+    require_same_masks: bool = True,
+    expand_adjcent: bool = False,
+    mask_dropout: float = 0,
+    non_overlapping: bool = False,
+) -> torch.Tensor:
+    """Compute random square-block masks over a flattened square grid.
+
+    The ``L`` positions of every sample are laid out on a ``d x d`` grid with
+    ``d = int(L ** 0.5)``, masked in ``mask_length x mask_length`` blocks and
+    flattened again.
+
+    Args:
+        shape: ``(B, L)``, number of samples and positions per sample.
+            NOTE(review): the code looks like it assumes L is a perfect
+            square (flat indices are drawn from [0, L) but scattered into
+            d * d cells) - confirm against callers.
+        mask_prob: target fraction of masked positions (of kept positions
+            when *inverse_mask* is set, since it is replaced by 1 - mask_prob).
+        mask_length: side length of a block; must be greater than 1.
+        mask_prob_adjust: added to *mask_prob* when deciding how many blocks
+            to draw initially.
+        inverse_mask: return ``1 - mask`` instead of ``mask``.
+        require_same_masks: adjust every sample to exactly
+            ``int(L * mask_prob)`` masked positions.
+        expand_adjcent: with *require_same_masks*, first grow the mask into
+            unmasked positions that have a masked 4-neighbour.
+        mask_dropout: over-samples the number of initial blocks and the
+            growth target by a factor of ``1 + mask_dropout``.
+        non_overlapping: draw blocks on a coarse grid so that they cannot
+            overlap.
+
+    Returns:
+        tensor with one row per sample (``B`` rows), 1 at masked positions
+        and 0 elsewhere (swapped when *inverse_mask* is set).
+    """
+
+    assert mask_length > 1
+
+    B, L = shape
+
+    # side length of the square grid
+    d = int(L**0.5)
+
+    if inverse_mask:
+        mask_prob = 1 - mask_prob
+
+    if non_overlapping:
+        # Pick cells of a coarse sz x sz grid without replacement, then blow
+        # every picked cell up to a mask_length x mask_length block with a
+        # transposed convolution of stride mask_length.
+        sz = math.ceil(d / mask_length)
+        inp_len = sz * sz
+
+        inp = torch.zeros((B, 1, sz, sz))
+        w = torch.ones((1, 1, mask_length, mask_length))
+
+        mask_inds = torch.multinomial(
+            1 - inp.view(B, -1),
+            int(inp_len * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
+            replacement=False,
+        )
+        inp.view(B, -1).scatter_(1, mask_inds, 1)
+
+        mask = torch.nn.functional.conv_transpose2d(inp, w, stride=mask_length).squeeze(
+            1
+        )
+        # the upsampled grid may be larger than d x d; crop it
+        if mask.size(-1) > d:
+            mask = mask[..., :d, :d]
+    else:
+        # Draw flat block positions (torch.randint, so repeats are possible)
+        # and mark them on the grid.
+        mask = torch.zeros((B, d, d))
+        mask_inds = torch.randint(
+            0,
+            L,
+            size=(
+                B,
+                int(
+                    L
+                    * ((mask_prob + mask_prob_adjust) / mask_length**2)
+                    * (1 + mask_dropout)
+                ),
+            ),
+        )
+        mask.view(B, -1).scatter_(1, mask_inds, 1)
+        centers = mask.nonzero(as_tuple=True)
+
+        # (sample, row, column) index lists of every cell to mask
+        inds = ([], [], [])
+
+        # paint a mask_length x mask_length block around every drawn position
+        offset = mask_length // 2
+        for i in range(mask_length):
+            for j in range(mask_length):
+                k1 = i - offset
+                k2 = j - offset
+                inds[0].append(centers[0])
+                inds[1].append(centers[1] + k1)
+                inds[2].append(centers[2] + k2)
+
+        i0 = torch.cat(inds[0])
+        # clamp so that blocks near the border stay inside the grid
+        i1 = torch.cat(inds[1]).clamp_(min=0, max=d - 1)
+        i2 = torch.cat(inds[2]).clamp_(min=0, max=d - 1)
+
+        mask[(i0, i1, i2)] = 1
+
+    def get_nbs(b, m, w):
+        # 1 where the kernel w sees at least one masked cell, flattened per sample
+        all_nbs = torch.nn.functional.conv2d(m.unsqueeze(1), w, padding="same")
+        all_nbs = all_nbs.clamp_max_(1).view(b, -1)
+        return all_nbs
+
+    if require_same_masks and expand_adjcent:
+        # 3x3 kernel selecting the four direct neighbours (up, down, left, right)
+        w = torch.zeros((1, 1, 3, 3))
+        w[..., 0, 1] = 1
+        w[..., 2, 1] = 1
+        w[..., 1, 0] = 1
+        w[..., 1, 2] = 1
+
+        all_nbs = get_nbs(B, mask, w)
+
+    mask = mask.reshape(B, -1)
+
+    if require_same_masks:
+        n_masks = mask.sum(dim=-1)
+        final_target_len = int(L * (mask_prob))
+        target_len = int(final_target_len * (1 + mask_dropout))
+
+        for i in range(len(mask)):
+            n = n_masks[i]
+            # row i of mask; the assignments below write into mask
+            m = mask[i]
+            r = 0
+            while expand_adjcent and n < target_len:
+                if r == 0:
+                    nbs = all_nbs[i]
+                else:
+                    nbs = get_nbs(1, m.view(1, d, d), w).flatten()
+
+                # candidates: unmasked cells with a masked neighbour
+                cands = (1 - m + nbs) > 1
+                cand_sz = int(cands.sum().item())
+
+                assert cand_sz > 0, f"{nbs} {cand_sz}"
+
+                to_mask = torch.multinomial(
+                    cands.float(), min(cand_sz, int(target_len - n)), replacement=False
+                )
+                m[to_mask] = 1
+                assert to_mask.numel() > 0
+                n += to_mask.numel()
+                r += 1
+
+            # randomly unmask / mask cells until exactly final_target_len remain
+            if n > final_target_len:
+                to_unmask = torch.multinomial(
+                    m, int(n - final_target_len), replacement=False
+                )
+                m[to_unmask] = 0
+            elif n < final_target_len:
+                to_mask = torch.multinomial(
+                    (1 - m), int(final_target_len - n), replacement=False
+                )
+                m[to_mask] = 1
+
+    if inverse_mask:
+        mask = 1 - mask
+
+    return mask
+
+
+def compute_block_mask_1d(
+    shape: Tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    mask_prob_adjust: float = 0,
+    inverse_mask: bool = False,
+    require_same_masks: bool = True,
+    expand_adjcent: bool = False,
+    mask_dropout: float = 0,
+    non_overlapping: bool = False,
+) -> torch.Tensor:
+    """Compute random span masks over sequences of length ``L``.
+
+    Args:
+        shape: ``(B, L)``, number of samples and positions per sample.
+        mask_prob: target fraction of masked positions (of kept positions
+            when *inverse_mask* is set, since it is replaced by 1 - mask_prob).
+        mask_length: length of a span.
+        mask_prob_adjust: added to *mask_prob* when deciding how many spans
+            to draw initially.
+        inverse_mask: return ``1 - mask`` instead of ``mask``.
+        require_same_masks: adjust every sample to exactly
+            ``int(L * mask_prob)`` masked positions.
+        expand_adjcent: with *require_same_masks*, first grow the mask into
+            unmasked positions that have a masked direct neighbour.
+        mask_dropout: over-samples the number of initial spans and the
+            growth target by a factor of ``1 + mask_dropout``.
+        non_overlapping: draw spans on a coarse grid so that they cannot
+            overlap.
+
+    Returns:
+        tensor with one row per sample (``B`` rows), 1 at masked positions
+        and 0 elsewhere (swapped when *inverse_mask* is set).
+    """
+
+    B, L = shape
+
+    if inverse_mask:
+        mask_prob = 1 - mask_prob
+
+    if non_overlapping:
+        # Pick slots of a coarse grid of sz entries without replacement, then
+        # blow every picked slot up to mask_length positions with a
+        # transposed convolution of stride mask_length.
+        sz = math.ceil(L / mask_length)
+
+        inp = torch.zeros((B, 1, sz))
+        w = torch.ones((1, 1, mask_length))
+
+        mask_inds = torch.multinomial(
+            1 - inp.view(B, -1),
+            int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
+            replacement=False,
+        )
+        inp.view(B, -1).scatter_(1, mask_inds, 1)
+
+        mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze(
+            1
+        )
+        # the upsampled sequence may be longer than L; crop it
+        if mask.size(-1) > L:
+            mask = mask[..., :L]
+
+    else:
+        # Draw span positions (torch.randint, so repeats are possible) and
+        # mark them.
+        mask = torch.zeros((B, L))
+        mask_inds = torch.randint(
+            0,
+            L,
+            size=(
+                B,
+                int(
+                    L
+                    * ((mask_prob + mask_prob_adjust) / mask_length)
+                    * (1 + mask_dropout)
+                ),
+            ),
+        )
+
+        mask.view(B, -1).scatter_(1, mask_inds, 1)
+        centers = mask.nonzero(as_tuple=True)
+
+        # (sample, position) index lists of every element to mask
+        inds = ([], [])
+
+        # paint a span of mask_length positions around every drawn position
+        offset = mask_length // 2
+        for i in range(mask_length):
+            k1 = i - offset
+            inds[0].append(centers[0])
+            inds[1].append(centers[1] + k1)
+
+        i0 = torch.cat(inds[0])
+        # clamp so that spans near the ends stay inside the sequence
+        i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1)
+
+        mask[(i0, i1)] = 1
+
+    def get_nbs(b, m, w):
+        # 1 where the kernel w sees at least one masked position, flattened per sample
+        all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same")
+        all_nbs = all_nbs.clamp_max_(1).view(b, -1)
+        return all_nbs
+
+    if require_same_masks and expand_adjcent:
+        # kernel selecting the left and right neighbour (centre weight is 0)
+        w = torch.ones((1, 1, 3))
+        w[..., 1] = 0
+        all_nbs = get_nbs(B, mask, w)
+
+    mask = mask.view(B, -1)
+
+    if require_same_masks:
+        n_masks = mask.sum(dim=-1)
+        final_target_len = int(L * (mask_prob))
+        target_len = int(final_target_len * (1 + mask_dropout))
+
+        for i in range(len(mask)):
+            n = n_masks[i]
+            # row i of mask; the assignments below write into mask
+            m = mask[i]
+            r = 0
+            while expand_adjcent and n < target_len:
+                if r == 0:
+                    nbs = all_nbs[i]
+                else:
+                    nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0)
+
+                # candidates: unmasked positions with a masked neighbour
+                cands = (1 - m + nbs) > 1
+                cand_sz = int(cands.sum().item())
+
+                assert cand_sz > 0, f"{nbs} {cand_sz}"
+
+                to_mask = torch.multinomial(
+                    cands.float(), min(cand_sz, int(target_len - n)), replacement=False
+                )
+                m[to_mask] = 1
+                assert to_mask.numel() > 0
+                n += to_mask.numel()
+                r += 1
+
+            # randomly unmask / mask positions until exactly final_target_len remain
+            if n > final_target_len:
+                to_unmask = torch.multinomial(
+                    m, int(n - final_target_len), replacement=False
+                )
+                m[to_unmask] = 0
+            elif n < final_target_len:
+                to_mask = torch.multinomial(
+                    (1 - m), int(final_target_len - n), replacement=False
+                )
+                m[to_mask] = 1
+
+    if inverse_mask:
+        mask = 1 - mask
+
+    return mask
+
+
+def get_mem_usage():
+    """Return a short description of used / available system memory in Mb,
+    or "N/A" when ``psutil`` cannot be imported."""
+    try:
+        import psutil
+
+        bytes_per_mb = 1024 * 1024
+        used_mb = psutil.virtual_memory().used / bytes_per_mb
+        avail_mb = psutil.virtual_memory().available / bytes_per_mb
+        return f"used={used_mb}Mb; avail={avail_mb}Mb"
+    except ImportError:
+        return "N/A"
+
+
+# lens: torch.LongTensor
+# returns: torch.BoolTensor
+def lengths_to_padding_mask(lens):
+    """Build a mask that is True at the padded positions.
+
+    Row ``i`` of the result has ``torch.max(lens)`` columns and is True at
+    every column index ``>= lens[i]``.
+    """
+    bsz = lens.size(0)
+    max_lens = torch.max(lens).item()
+    positions = torch.arange(max_lens).to(lens.device).view(1, max_lens)
+    positions = positions.expand(bsz, -1)
+    limits = lens.view(bsz, 1).expand(-1, max_lens)
+    return positions >= limits
+
+
+# lens: torch.LongTensor
+# returns: torch.BoolTensor
+def lengths_to_mask(lens):
+    """Inverse of :func:`lengths_to_padding_mask`: True at the positions
+    that lie within each sequence's length."""
+    padding = lengths_to_padding_mask(lens)
+    return ~padding
+
+
+def get_buckets(sizes, num_buckets):
+    """Compute bucket upper bounds from the distribution of *sizes*.
+
+    The bounds are the ``100 * k / num_buckets`` percentiles of *sizes* for
+    ``k = 1 .. num_buckets``, each snapped down to an existing value
+    ("lower" percentile method), with duplicates removed.
+
+    Args:
+        sizes: array-like of sizes
+        num_buckets (int): number of percentile steps
+
+    Returns:
+        np.ndarray: sorted, unique bucket upper bounds
+    """
+    percentiles = np.linspace(0, 100, num_buckets + 1)
+    try:
+        # NumPy >= 1.22 spells the keyword `method`; `interpolation` is
+        # deprecated there.
+        edges = np.percentile(sizes, percentiles, method="lower")
+    except TypeError:
+        # older NumPy only understands the legacy keyword
+        edges = np.percentile(sizes, percentiles, interpolation="lower")
+    # the 0th percentile (the minimum) is not a bucket bound
+    return np.unique(edges[1:])
+
+
+def get_bucketed_sizes(orig_sizes, buckets):
+    """Return a copy of *orig_sizes* in which every size is replaced by the
+    bound of the bucket it falls into.
+
+    *buckets* is walked in order; a size in ``(previous bound, bound]`` is
+    set to ``bound`` (the first interval starts just below 0). Sizes above
+    the last bound are left unchanged. All sizes must be non-negative.
+    """
+    bucketed = np.copy(orig_sizes)
+    assert np.min(bucketed) >= 0
+    lower = -1
+    for upper in buckets:
+        in_bucket = (bucketed > lower) & (bucketed <= upper)
+        bucketed[in_bucket] = upper
+        lower = upper
+    return bucketed
+
+
+def _find_extra_valid_paths(dataset_path: str) -> set:
+    """Collect the extension-less base names of all entries matching
+    'valid*[0-9].*' in every directory listed in *dataset_path*."""
+    matching_names = set()
+    for sub_dir in utils.split_paths(dataset_path):
+        for entry in PathManager.ls(sub_dir):
+            if re.match("valid*[0-9].*", entry) is not None:
+                matching_names.add(os.path.basename(entry))
+    # Remove .bin, .idx etc
+    return {os.path.splitext(name)[0] for name in matching_names}
+
+
+def raise_if_valid_subsets_unintentionally_ignored(train_cfg) -> None:
+    """Raises if there are paths matching 'valid*[0-9].*' which are not combined or ignored.
+
+    The check is skipped when unused subsets are explicitly ignored, when
+    valid subsets are combined, when validation is disabled, or when the
+    task config has no ``data`` attribute.
+
+    Raises:
+        ValueError: if a matching path is not listed in
+            ``train_cfg.dataset.valid_subset``.
+    """
+    dataset_cfg = train_cfg.dataset
+    check_not_needed = (
+        dataset_cfg.ignore_unused_valid_subsets
+        or dataset_cfg.combine_valid_subsets
+        or dataset_cfg.disable_validation
+        or not hasattr(train_cfg.task, "data")
+    )
+    if check_not_needed:
+        return
+
+    other_paths = _find_extra_valid_paths(train_cfg.task.data)
+    specified_subsets = train_cfg.dataset.valid_subset.split(",")
+    ignored_paths = [p for p in other_paths if p not in specified_subsets]
+    if not ignored_paths:
+        return
+
+    advice = "Set --combine-val to combine them or --ignore-unused-valid-subsets to ignore them."
+    msg = f"Valid paths {ignored_paths} will be ignored. {advice}"
+    raise ValueError(msg)
+
+
+def compute_mask_indices_for_one(
+    sz,
+    mask_prob: float,
+    mask_length: int,
+    seed=None,
+    epoch=None,
+    index=None,
+    min_masks=0,
+):
+    """Compute a boolean span mask for a single sequence of length ``sz``.
+
+    Pass seed, epoch and index together for deterministic masking; with
+    ``seed=None`` a freshly seeded generator is used.
+    """
+    # `is not None` (rather than truthiness) keeps seed=0 usable as a real seed
+    seed = int(hash((seed, epoch, index)) % 1e6) if seed is not None else None
+    rng = np.random.default_rng(seed)
+
+    # decide elements to mask
+    mask = np.full(sz, False)
+    # add a random number for probabilistic rounding
+    num_mask = int(mask_prob * sz / float(mask_length) + rng.random())
+    num_mask = max(min_masks, num_mask)
+
+    # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453)
+    mask_idc = rng.choice(sz, num_mask, replace=False)
+    mask_idc = np.concatenate([mask_idc + i for i in range(mask_length)])
+    mask_idc = mask_idc[mask_idc < len(mask)]
+    try:
+        mask[mask_idc] = True
+    except Exception:  # something wrong
+        print(f"Assigning mask indexes {mask_idc} to mask {mask} failed!")
+        raise
+
+    return mask
+
+
+def compute_mask_indices_v2(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    min_masks: int = 0,
+    require_same_masks: bool = True,
+    seed: Optional[int] = None,
+    epoch: Optional[int] = None,
+    indices: Optional[torch.Tensor] = None,
+) -> np.ndarray:
+    """Compute a (bsz, timesteps) bool span mask, one row per batch element."""
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+    for i in range(bsz):
+        sz = all_sz
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+        index = indices[i].item() if indices is not None else None
+        mask[i, :sz] = compute_mask_indices_for_one(
+            sz, mask_prob, mask_length, seed, epoch, index, min_masks
+        )
+
+    if require_same_masks:
+        index_sum = indices.sum().item() if indices is not None else None
+        # `is not None` (rather than truthiness) so seed=0 is honoured as a seed
+        seed = int(hash((seed, epoch, index_sum)) % 1e6) if seed is not None else None
+        rng = np.random.default_rng(seed)
+
+        num_mask = mask.sum(-1).min()
+        for i in range(bsz):
+            extra = mask[i].sum() - num_mask
+            if extra > 0:
+                to_unmask = rng.choice(np.nonzero(mask[i])[0], extra, replace=False)
+                mask[i, to_unmask] = False
+
+    return mask
+
+
+# TODO: a copy of the original compute_mask_indices
+def compute_mask_indices_v3(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+    require_same_masks: bool = True,
+    mask_dropout: float = 0.0,
+    seed: Optional[int] = None,
+    epoch: Optional[int] = None,
+    indices: Optional[torch.Tensor] = None,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape
+
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask length
+        min_masks: minimum number of masked spans
+        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+        mask_dropout: randomly dropout this percentage of masks in each example
+        seed, epoch, indices: if all three are given, row i uses a generator seeded from
+            hash((seed, epoch, indices[i])); otherwise a freshly seeded generator is used
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    # add a random number for probabilistic rounding
+    all_num_mask = int(mask_prob * all_sz / float(mask_length) + np.random.rand())
+
+    all_num_mask = max(min_masks, all_num_mask)
+
+    mask_idcs = []
+    for i in range(bsz):
+        if seed is not None and epoch is not None and indices is not None:
+            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
+        else:
+            seed_i = None
+        rng = np.random.default_rng(seed_i)
+
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length)
+                + rng.random()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask
+
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            lengths = rng.integers(int(mask_other), mask_length * 2 + 1, size=num_mask)
+        elif mask_type == "normal":
+            lengths = rng.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = rng.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+
+        if no_overlap:
+            mask_idc = []
+
+            def arrange(s, e, length, keep_length):
+                span_start = rng.integers(s, e - length)
+                mask_idc.extend(span_start + i for i in range(length))
+
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                lens = np.fromiter(
+                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                    int,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / l_sum
+                c = rng.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            mask_idc = np.asarray(mask_idc)
+        else:
+            min_len = min(lengths)
+            if sz - min_len <= num_mask:
+                min_len = sz - num_mask - 1
+
+            mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
+
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ]
+            )
+
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+    # NOTE: rng below is the generator that was created for the last batch row
+    min_len = min([len(m) for m in mask_idcs])
+    for i, mask_idc in enumerate(mask_idcs):
+        if len(mask_idc) > min_len and require_same_masks:
+            mask_idc = rng.choice(mask_idc, min_len, replace=False)
+        if mask_dropout > 0:
+            num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int)
+            mask_idc = rng.choice(mask_idc, len(mask_idc) - num_holes, replace=False)
+
+        mask[i, mask_idc] = True
+
+    return mask
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
new file mode 100644
index 0000000000..c61f31d6b2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
@@ -0,0 +1,178 @@
+# cython: language_level=3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+cimport cython
+cimport numpy as np
+
+from libc.stdint cimport int32_t, int64_t
+from libcpp cimport bool as bool_t
+
+ctypedef int64_t DTYPE_t
+
+@cython.cdivision(True)
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef list batch_by_size_vec(
+    np.ndarray[int64_t, ndim=1] indices,
+    np.ndarray[int64_t, ndim=1] num_tokens_vec,
+    int64_t max_tokens,
+    int64_t max_sentences,
+    int32_t bsz_mult,
+):
+    if indices.shape[0] == 0:
+        return []
+
+    assert max_tokens <= 0 or np.max(num_tokens_vec) <= max_tokens, (
+        f"Sentences lengths should not exceed max_tokens={max_tokens}"
+    )
+
+    cdef int32_t indices_len = indices.shape[0]
+    cdef np.ndarray[int32_t, ndim=1] batches_ends = \
+            np.zeros(indices_len, dtype=np.int32)
+    cdef int32_t[:] batches_ends_view = batches_ends
+    cdef int64_t[:] num_tokens_view = num_tokens_vec
+
+    cdef int32_t pos = 0
+    cdef int32_t new_batch_end = 0
+
+    cdef int64_t new_batch_max_tokens = 0
+    cdef int32_t new_batch_sentences = 0
+    cdef int64_t new_batch_num_tokens = 0
+
+    cdef bool_t overflow = False
+    cdef bool_t size_matches_with_bsz_mult = False
+
+    cdef int32_t batches_count = 0
+    cdef int32_t batch_start = 0
+    cdef int64_t tail_max_tokens = 0
+    cdef int64_t batch_max_tokens = 0
+
+    for pos in range(indices_len):
+        # At every pos we keep stats about the last complete batch [batch_start:batch_end),
+        #      and tail [batch_end:pos].
+        # 1) Every time when (batch + tail) forms a valid batch
+        #      (according to max_tokens, max_sentences and bsz_mult) we append tail to batch.
+        # 2) When (batch+tail) violates max_tokens or max_sentences constraints
+        #      we finalize running batch, and tail becomes a new batch.
+        # 3) There is a corner case when tail also violates constraints.
+        #      In that situation [batch_end:pos-1] (tail without the current pos)
+        #      gets added to the finalized batches, while [pos:pos] becomes a new tail.
+        #
+        # Important: For the sake of performance try to avoid using function calls within this loop.
+
+        tail_max_tokens = tail_max_tokens \
+                            if tail_max_tokens > num_tokens_view[pos] \
+                            else num_tokens_view[pos]
+        new_batch_end = pos + 1
+        new_batch_max_tokens = batch_max_tokens \
+                                if batch_max_tokens > tail_max_tokens \
+                                else tail_max_tokens
+        new_batch_sentences = new_batch_end - batch_start
+        new_batch_num_tokens = new_batch_sentences * new_batch_max_tokens
+
+        overflow = (new_batch_sentences > max_sentences > 0 or
+                    new_batch_num_tokens > max_tokens > 0)
+        size_matches_with_bsz_mult = (new_batch_sentences < bsz_mult or
+                                      new_batch_sentences % bsz_mult == 0)
+
+        if overflow:
+            tail_num_tokens = tail_max_tokens * \
+                    (new_batch_end - batches_ends_view[batches_count])
+            tail_overflow = tail_num_tokens > max_tokens > 0
+            # In case of a tail overflow finalize two batches
+            if tail_overflow:
+                batches_count += 1
+                batches_ends_view[batches_count] = pos
+                tail_max_tokens = num_tokens_view[pos]
+            batch_start = batches_ends_view[batches_count]
+            batches_count += 1
+            new_batch_max_tokens = tail_max_tokens
+
+        if overflow or size_matches_with_bsz_mult:
+            batches_ends_view[batches_count] = new_batch_end
+            batch_max_tokens = new_batch_max_tokens
+            tail_max_tokens = 0
+    if batches_ends_view[batches_count] != indices_len:
+        batches_count += 1
+    # Memory and time-efficient split
+    return np.split(indices, batches_ends[:batches_count])
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef list batch_by_size_fn(
+    np.ndarray[DTYPE_t, ndim=1] indices,
+    num_tokens_fn,
+    int64_t max_tokens,
+    int64_t max_sentences,
+    int32_t bsz_mult,
+):
+    cdef int32_t indices_len = indices.shape[0]
+    cdef np.ndarray[int64_t, ndim=1] num_tokens_vec = np.zeros(indices_len,
+                                                               dtype=np.int64)
+    cdef DTYPE_t[:] indices_view = indices
+    cdef DTYPE_t[:] num_tokens_vec_view = num_tokens_vec
+    cdef int64_t pos
+    for pos in range(indices_len):
+        num_tokens_vec[pos] = num_tokens_fn(indices_view[pos])
+    return batch_by_size_vec(indices, num_tokens_vec, max_tokens,
+        max_sentences, bsz_mult,)
+
+
+cdef _find_valid_shape(
+    DTYPE_t[:, :] shapes_view,
+    int64_t num_sentences,
+    int64_t num_tokens,
+):
+    """Return index of first valid shape or -1 if none is found."""
+    for i in range(shapes_view.shape[0]):
+        if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]:
+            return i
+    return -1
+
+
+@cython.cdivision(True)
+cpdef list batch_fixed_shapes_fast(
+    np.ndarray[DTYPE_t, ndim=1] indices,
+    num_tokens_fn,
+    np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted,
+):
+    """Greedily group indices so each batch fits one (num_sentences, num_tokens)
+    row of fixed_shapes_sorted; returns a list of lists of indices."""
+    cdef int64_t sample_len = 0
+    cdef list batch = []
+    cdef list batches = []
+    cdef int64_t i
+    cdef int64_t idx
+    cdef int64_t num_tokens
+    cdef DTYPE_t[:] indices_view = indices
+    cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted
+
+    for i in range(len(indices_view)):
+        idx = indices_view[i]
+        num_tokens = num_tokens_fn(idx)
+        sample_len = max(sample_len, num_tokens)
+
+        shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
+        if shape_idx == -1:
+            batches.append(batch)
+            batch = []
+            # idx opens the next batch, so that batch's running max length
+            # must start from idx's own length rather than 0
+            sample_len = num_tokens
+            shapes_view = fixed_shapes_sorted
+        elif shape_idx > 0:
+            # small optimization for the next call to _find_valid_shape
+            shapes_view = shapes_view[shape_idx:]
+
+        batch.append(idx)
+
+    if len(batch) > 0:
+        batches.append(batch)
+
+    return batches
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
new file mode 100644
index 0000000000..a900fc6f96
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
@@ -0,0 +1,443 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+
+from . import FairseqDataset, data_utils
+
+
+def collate(
+    samples,
+    pad_idx,
+    eos_idx,
+    vocab,
+    left_pad_source=False,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+):
+    """Collate denoising samples into a mini-batch sorted by descending source length.
+
+    Each sample is a dict with "id", "source" and optionally "target". Returns {}
+    for an empty list, otherwise a dict with "id", "ntokens", "nsentences",
+    "net_input", "target" and "sort_order".
+    """
+    assert input_feeding
+    if len(samples) == 0:
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx,
+            eos_idx=None,  # use eos_idx of each sample instead of vocab.eos()
+            left_pad=left_pad,
+            move_eos_to_beginning=move_eos_to_beginning,
+            pad_to_length=pad_to_length,
+        )
+
+    ids = torch.LongTensor([s["id"] for s in samples])
+    src_tokens = merge(
+        "source",
+        left_pad=left_pad_source,
+        pad_to_length=pad_to_length["source"] if pad_to_length is not None else None,
+    )
+    # sort by descending source length
+    src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    ids = ids.index_select(0, sort_order)
+    src_tokens = src_tokens.index_select(0, sort_order)
+
+    prev_output_tokens = None
+    target = None
+    if samples[0].get("target", None) is not None:
+        tgt_pad_to = pad_to_length["target"] if pad_to_length is not None else None
+        target = merge("target", left_pad=left_pad_target, pad_to_length=tgt_pad_to)
+        target = target.index_select(0, sort_order)
+        ntokens = sum(len(s["target"]) for s in samples)
+
+        if input_feeding:
+            # we create a shifted version of targets for feeding the
+            # previous output token(s) into the next decoder step
+            prev_output_tokens = merge(
+                "target",
+                left_pad=left_pad_target,
+                move_eos_to_beginning=True,
+                pad_to_length=tgt_pad_to,
+            )
+            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
+    else:
+        ntokens = sum(len(s["source"]) for s in samples)
+
+    batch = {
+        "id": ids,
+        "ntokens": ntokens,
+        "net_input": {
+            "src_tokens": src_tokens,
+            "src_lengths": src_lengths,
+        },
+        "target": target,
+        # number of samples in the batch, not the token length of the first source
+        "nsentences": len(samples),
+        "sort_order": sort_order,
+    }
+    if prev_output_tokens is not None:
+        batch["net_input"]["prev_output_tokens"] = prev_output_tokens
+
+    return batch
+
+
+class DenoisingDataset(FairseqDataset):
+    """
+    A wrapper around TokenBlockDataset for BART dataset.
+
+    Args:
+        dataset (TokenBlockDataset): dataset to wrap
+        sizes (List[int]): sentence lengths
+        vocab (~fairseq.data.Dictionary): vocabulary
+        mask_idx (int): dictionary index used for masked token
+        mask_whole_words: only mask whole words. This should be a byte mask
+            over vocab indices, indicating whether it is the beginning of a
+            word. We will extend any mask to encompass the whole word.
+        shuffle (bool, optional): shuffle the elements before batching.
+          Default: ``True``
+        seed: Seed for random number generator for reproducibility.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        vocab,
+        mask_idx,
+        mask_whole_words,
+        shuffle,
+        seed,
+        mask,
+        mask_random,
+        insert,
+        rotate,
+        permute_sentences,
+        bpe,
+        replace_length,
+        mask_length,
+        poisson_lambda,
+        eos=None,
+        item_transform_func=None,
+    ):
+        self.dataset = dataset
+
+        self.sizes = sizes
+
+        self.vocab = vocab
+        self.shuffle = shuffle
+        self.seed = seed
+        self.mask_idx = mask_idx
+        self.mask_whole_word = mask_whole_words
+        self.mask_ratio = mask
+        self.random_ratio = mask_random
+        self.insert_ratio = insert
+        self.rotate_ratio = rotate
+        self.permute_sentence_ratio = permute_sentences
+        self.eos = eos if eos is not None else vocab.eos()
+        self.item_transform_func = item_transform_func
+
+        if bpe != "gpt2":
+            self.full_stop_index = self.vocab.eos()
+        else:
+            assert bpe == "gpt2"
+            self.full_stop_index = self.vocab.index("13")
+
+        self.replace_length = replace_length
+        if self.replace_length not in [-1, 0, 1]:
+            raise ValueError(f"invalid arg: replace_length={self.replace_length}")
+        if mask_length not in ["subword", "word", "span-poisson"]:
+            raise ValueError(f"invalid arg: mask-length={mask_length}")
+        if mask_length == "subword" and replace_length not in [0, 1]:
+            raise ValueError(f"if using subwords, use replace-length=1 or 0")
+
+        self.mask_span_distribution = None
+        if mask_length == "span-poisson":
+            _lambda = poisson_lambda
+
+            lambda_to_the_k = 1
+            e_to_the_minus_lambda = math.exp(-_lambda)
+            k_factorial = 1
+            ps = []
+            for k in range(0, 128):
+                ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial)
+                lambda_to_the_k *= _lambda
+                k_factorial *= k + 1
+                if ps[-1] < 0.0000001:
+                    break
+            ps = torch.FloatTensor(ps)
+            self.mask_span_distribution = torch.distributions.Categorical(ps)
+
+        self.epoch = 0
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return True  # only the noise changes, not item sizes
+
+    def set_epoch(self, epoch, **unused):
+        self.epoch = epoch
+
+    def __getitem__(self, index):
+        with data_utils.numpy_seed(self.seed, self.epoch, index):
+            tokens = self.dataset[index]
+            assert tokens[-1] == self.eos
+            source, target = tokens, tokens.clone()
+
+            if self.permute_sentence_ratio > 0.0:
+                source = self.permute_sentences(source, self.permute_sentence_ratio)
+
+            if self.mask_ratio > 0:
+                source = self.add_whole_word_mask(source, self.mask_ratio)
+
+            if self.insert_ratio > 0:
+                source = self.add_insertion_noise(source, self.insert_ratio)
+
+            if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio:
+                source = self.add_rolling_noise(source)
+        # there can be additional changes to make:
+        if self.item_transform_func is not None:
+            source, target = self.item_transform_func(source, target)
+
+        assert (source >= 0).all()
+        assert (source[1:-1] >= 1).all()
+        assert (source <= len(self.vocab)).all()
+        assert source[0] == self.vocab.bos()
+        assert source[-1] == self.eos
+        return {
+            "id": index,
+            "source": source,
+            "target": target,
+        }
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def permute_sentences(self, source, p=1.0):
+        full_stops = source == self.full_stop_index
+        # Pretend it ends with a full stop so last span is a sentence
+        full_stops[-2] = 1
+
+        # Tokens that are full stops, where the previous token is not
+        sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2
+        result = source.clone()
+
+        num_sentences = sentence_ends.size(0)
+        num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0)
+        substitutions = torch.randperm(num_sentences)[:num_to_permute]
+        ordering = torch.arange(0, num_sentences)
+        ordering[substitutions] = substitutions[torch.randperm(num_to_permute)]
+
+        # Ignore <bos> at start
+        index = 1
+        for i in ordering:
+            sentence = source[(sentence_ends[i - 1] if i > 0 else 1) : sentence_ends[i]]
+            result[index : index + sentence.size(0)] = sentence
+            index += sentence.size(0)
+        return result
+
+    def word_starts(self, source):
+        if self.mask_whole_word is not None:
+            is_word_start = self.mask_whole_word.gather(0, source)
+        else:
+            is_word_start = torch.ones(source.size())
+        is_word_start[0] = 0
+        is_word_start[-1] = 0
+        return is_word_start
+
+    def add_whole_word_mask(self, source, p):
+        is_word_start = self.word_starts(source)
+        num_to_mask = int(math.ceil(is_word_start.float().sum() * p))
+        num_inserts = 0
+        if num_to_mask == 0:
+            return source
+
+        if self.mask_span_distribution is not None:
+            lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,))
+
+            # Make sure we have enough to mask
+            cum_length = torch.cumsum(lengths, 0)
+            while cum_length[-1] < num_to_mask:
+                lengths = torch.cat(
+                    [
+                        lengths,
+                        self.mask_span_distribution.sample(sample_shape=(num_to_mask,)),
+                    ],
+                    dim=0,
+                )
+                cum_length = torch.cumsum(lengths, 0)
+
+            # Trim to masking budget
+            i = 0
+            while cum_length[i] < num_to_mask:
+                i += 1
+            lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1])
+            num_to_mask = i + 1
+            lengths = lengths[:num_to_mask]
+
+            # Handle 0-length mask (inserts) separately
+            lengths = lengths[lengths > 0]
+            num_inserts = num_to_mask - lengths.size(0)
+            num_to_mask -= num_inserts
+            if num_to_mask == 0:
+                return self.add_insertion_noise(source, num_inserts / source.size(0))
+
+            assert (lengths > 0).all()
+        else:
+            lengths = torch.ones((num_to_mask,)).long()
+        assert is_word_start[-1] == 0
+        word_starts = is_word_start.nonzero(as_tuple=False)
+        indices = word_starts[
+            torch.randperm(word_starts.size(0))[:num_to_mask]
+        ].squeeze(1)
+        mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio
+
+        source_length = source.size(0)
+        assert source_length - 1 not in indices
+        to_keep = torch.ones(source_length, dtype=torch.bool)
+        is_word_start[
+            -1
+        ] = 255  # acts as a long length, so spans don't go over the end of doc
+        if self.replace_length == 0:
+            to_keep[indices] = 0
+        else:
+            # keep index, but replace it with [MASK]
+            source[indices] = self.mask_idx
+            source[indices[mask_random]] = torch.randint(
+                1, len(self.vocab), size=(mask_random.sum(),)
+            )
+
+        if self.mask_span_distribution is not None:
+            assert len(lengths.size()) == 1
+            assert lengths.size() == indices.size()
+            lengths -= 1
+            while indices.size(0) > 0:
+                assert lengths.size() == indices.size()
+                lengths -= is_word_start[indices + 1].long()
+                uncompleted = lengths >= 0
+                indices = indices[uncompleted] + 1
+                mask_random = mask_random[uncompleted]
+                lengths = lengths[uncompleted]
+                if self.replace_length != -1:
+                    # delete token
+                    to_keep[indices] = 0
+                else:
+                    # keep index, but replace it with [MASK]
+                    source[indices] = self.mask_idx
+                    source[indices[mask_random]] = torch.randint(
+                        1, len(self.vocab), size=(mask_random.sum(),)
+                    )
+        else:
+            # A bit faster when all lengths are 1
+            while indices.size(0) > 0:
+                uncompleted = is_word_start[indices + 1] == 0
+                indices = indices[uncompleted] + 1
+                mask_random = mask_random[uncompleted]
+                if self.replace_length != -1:
+                    # delete token
+                    to_keep[indices] = 0
+                else:
+                    # keep index, but replace it with [MASK]
+                    source[indices] = self.mask_idx
+                    source[indices[mask_random]] = torch.randint(
+                        1, len(self.vocab), size=(mask_random.sum(),)
+                    )
+
+                assert source_length - 1 not in indices
+
+        source = source[to_keep]
+
+        if num_inserts > 0:
+            source = self.add_insertion_noise(source, num_inserts / source.size(0))
+
+        return source
+
+    def add_permuted_noise(self, tokens, p):
+        num_words = len(tokens)
+        num_to_permute = math.ceil(((num_words * 2) * p) / 2.0)
+        substitutions = torch.randperm(num_words - 2)[:num_to_permute] + 1
+        tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]]
+        return tokens
+
+    def add_rolling_noise(self, tokens):
+        offset = np.random.randint(1, max(1, tokens.size(-1) - 1) + 1)
+        tokens = torch.cat(
+            (tokens[0:1], tokens[offset:-1], tokens[1:offset], tokens[-1:]),
+            dim=0,
+        )
+        return tokens
+
+    def add_insertion_noise(self, tokens, p):
+        if p == 0.0:
+            return tokens
+
+        num_tokens = len(tokens)
+        n = int(math.ceil(num_tokens * p))
+
+        noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1
+        noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool)
+        noise_mask[noise_indices] = 1
+        result = torch.LongTensor(n + len(tokens)).fill_(-1)
+
+        num_random = int(math.ceil(n * self.random_ratio))
+        result[noise_indices[num_random:]] = self.mask_idx
+        result[noise_indices[:num_random]] = torch.randint(
+            low=1, high=len(self.vocab), size=(num_random,)
+        )
+
+        result[~noise_mask] = tokens
+
+        assert (result >= 0).all()
+        return result
+
+    def collater(self, samples, pad_to_length=None):
+        """Merge a list of samples to form a mini-batch.
+        Args:
+            samples (List[dict]): samples to collate
+        Returns:
+            dict: a mini-batch of data
+        """
+        return collate(
+            samples, self.vocab.pad(), self.eos, self.vocab, pad_to_length=pad_to_length
+        )
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return self.sizes[index]
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            indices = np.random.permutation(len(self))
+        else:
+            indices = np.arange(len(self))
+        return indices[np.argsort(self.sizes[indices], kind="mergesort")]
+
+    def prefetch(self, indices):
+        # There is one wrapped dataset here, not a src/tgt pair.
+        self.dataset.prefetch(indices)
+
+    @property
+    def supports_prefetch(self):
+        """Whether the wrapped dataset advertises prefetch support."""
+        # __init__ only sets self.dataset; self.src / self.tgt never exist.
+        return (
+            hasattr(self.dataset, "supports_prefetch")
+            and self.dataset.supports_prefetch
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
new file mode 100644
index 0000000000..d6495389f0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
@@ -0,0 +1,401 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from collections import Counter
+from multiprocessing import Pool
+
+import torch
+from fairseq import utils
+from fairseq.data import data_utils
+from fairseq.file_chunker_utils import Chunker, find_offsets
+from fairseq.file_io import PathManager
+from fairseq.tokenizer import tokenize_line
+
+
+class Dictionary:
+    """A mapping from symbols to consecutive integers"""
+
+    def __init__(
+        self,
+        *,  # begin keyword-only arguments
+        bos="<s>",
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+        extra_special_symbols=None,
+    ):
+        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
+        self.symbols = []
+        self.count = []
+        self.indices = {}
+        self.bos_index = self.add_symbol(bos)
+        self.pad_index = self.add_symbol(pad)
+        self.eos_index = self.add_symbol(eos)
+        self.unk_index = self.add_symbol(unk)
+        if extra_special_symbols:
+            for s in extra_special_symbols:
+                self.add_symbol(s)
+        self.nspecial = len(self.symbols)
+
+    def __eq__(self, other):  # objects without ``indices`` compare unequal
+        return self.indices == getattr(other, "indices", None)
+
+    def __getitem__(self, idx):
+        if idx < len(self.symbols):
+            return self.symbols[idx]
+        return self.unk_word
+
+    def get_count(self, idx):
+        return self.count[idx]
+
+    def __len__(self):
+        """Returns the number of symbols in the dictionary"""
+        return len(self.symbols)
+
+    def __contains__(self, sym):
+        return sym in self.indices
+
+    def index(self, sym):
+        """Returns the index of the specified symbol"""
+        assert isinstance(sym, str)
+        if sym in self.indices:
+            return self.indices[sym]
+        return self.unk_index
+
+    def string(
+        self,
+        tensor,
+        bpe_symbol=None,
+        escape_unk=False,
+        extra_symbols_to_ignore=None,
+        unk_string=None,
+        include_eos=False,
+        separator=" ",
+    ):
+        """Helper for converting a tensor of token indices to a string.
+
+        A 2-D tensor is converted row by row (rows joined by newlines) with
+        the same options for every row. EOS (unless *include_eos*), BOS and
+        *extra_symbols_to_ignore* are skipped; <unk> is rendered as
+        *unk_string* if given, else via ``unk_string(escape_unk)``. Tokens are
+        joined with *separator* and post-processed with *bpe_symbol*.
+        """
+        if torch.is_tensor(tensor) and tensor.dim() == 2:
+            # Forward every option, including unk_string and separator.
+            return "\n".join(
+                self.string(
+                    row,
+                    bpe_symbol,
+                    escape_unk,
+                    extra_symbols_to_ignore,
+                    unk_string=unk_string,
+                    include_eos=include_eos,
+                    separator=separator,
+                )
+                for row in tensor
+            )
+
+        extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
+        if not include_eos:
+            extra_symbols_to_ignore.add(self.eos())
+        if hasattr(self, "bos_index"):
+            extra_symbols_to_ignore.add(self.bos())
+
+        def token_string(i):
+            if i == self.unk():
+                if unk_string is not None:
+                    return unk_string
+                return self.unk_string(escape_unk)
+            return self[i]
+
+        kept = (i for i in tensor if utils.item(i) not in extra_symbols_to_ignore)
+        sent = separator.join(token_string(i) for i in kept)
+        return data_utils.post_process(sent, bpe_symbol)
+
+    def unk_string(self, escape=False):
+        """Return unknown string, optionally escaped as: <<unk>>"""
+        if escape:
+            return "<{}>".format(self.unk_word)
+        else:
+            return self.unk_word
+
+    def add_symbol(self, word, n=1, overwrite=False):
+        """Add *n* occurrences of *word* and return its index; a new entry is
+        appended when the word is unknown or *overwrite* is set."""
+        if overwrite or word not in self.indices:
+            new_idx = len(self.symbols)
+            self.indices[word] = new_idx
+            self.symbols.append(word)
+            self.count.append(n)
+            return new_idx
+        known_idx = self.indices[word]
+        self.count[known_idx] = self.count[known_idx] + n
+        return known_idx
+
+    def update(self, new_dict):
+        """Merge *new_dict* into this dictionary: counts of shared words are
+        summed and words seen for the first time are appended."""
+        for word in new_dict.symbols:
+            extra = new_dict.count[new_dict.indices[word]]
+            own_idx = self.indices.get(word)
+            if own_idx is None:
+                self.indices[word] = len(self.symbols)
+                self.symbols.append(word)
+                self.count.append(extra)
+            else:
+                self.count[own_idx] = self.count[own_idx] + extra
+
+    def finalize(self, threshold=-1, nwords=-1, padding_factor=8):
+        """Sort symbols by frequency in descending order, ignoring special ones.
+
+        Args:
+            - threshold defines the minimum word count
+            - nwords defines the total number of words in the final dictionary,
+                including special symbols
+            - padding_factor can be used to pad the dictionary size to be a
+                multiple of 8, which is important on some hardware (e.g., Nvidia
+                Tensor Cores).
+        """
+        if nwords <= 0:
+            nwords = len(self)
+
+        new_indices = dict(zip(self.symbols[: self.nspecial], range(self.nspecial)))
+        new_symbols = self.symbols[: self.nspecial]
+        new_count = self.count[: self.nspecial]
+
+        c = Counter(
+            dict(
+                sorted(zip(self.symbols[self.nspecial :], self.count[self.nspecial :]))
+            )
+        )
+        for symbol, count in c.most_common(nwords - self.nspecial):
+            if count >= threshold:
+                new_indices[symbol] = len(new_symbols)
+                new_symbols.append(symbol)
+                new_count.append(count)
+            else:
+                break
+
+        assert len(new_symbols) == len(new_indices)
+
+        self.count = list(new_count)
+        self.symbols = list(new_symbols)
+        self.indices = new_indices
+
+        self.pad_to_multiple_(padding_factor)
+
+    def pad_to_multiple_(self, padding_factor):
+        """Append zero-count ``madeupwordNNNN`` symbols until the dictionary
+        size is a multiple of *padding_factor*; factors <= 1 add nothing.
+        """
+        filler_id = 0
+        while padding_factor > 1 and len(self) % padding_factor != 0:
+            self.add_symbol("madeupword{:04d}".format(filler_id), n=0)
+            filler_id += 1
+
+    def bos(self):
+        """Helper to get index of beginning-of-sentence symbol"""
+        return self.bos_index
+
+    def pad(self):
+        """Helper to get index of pad symbol"""
+        return self.pad_index
+
+    def eos(self):
+        """Helper to get index of end-of-sentence symbol"""
+        return self.eos_index
+
+    def unk(self):
+        """Helper to get index of unk symbol"""
+        return self.unk_index
+
+    @classmethod
+    def load(cls, f):
+        """Loads the dictionary from a text file with the format:
+
+        ```
+        <symbol0> <count0>
+        <symbol1> <count1>
+        ...
+        ```
+        """
+        d = cls()
+        d.add_from_file(f)
+        return d
+
+    def add_from_file(self, f):
+        """
+        Loads a pre-existing dictionary from a text file and adds its symbols
+        to this instance.
+
+        *f* is a path or an open text file of ``<token> <cnt> [flags]`` lines.
+        A duplicate token raises RuntimeError unless flagged with
+        ``#fairseq:overwrite``; an unparsable line raises ValueError.
+        """
+        if isinstance(f, str):
+            try:
+                with open(PathManager.get_local_path(f), "r", encoding="utf-8") as fd:
+                    self.add_from_file(fd)
+            except UnicodeError as err:
+                raise Exception(
+                    "Incorrect encoding detected in {}, please "
+                    "rebuild the dataset".format(f)
+                ) from err
+            return
+
+        lines = f.readlines()
+        indices_start_line = self._load_meta(lines)
+
+        for line in lines[indices_start_line:]:
+            try:
+                # Keep ``line`` intact so the error below can quote it in full.
+                word, field = line.rstrip().rsplit(" ", 1)
+                overwrite = field == "#fairseq:overwrite"
+                if overwrite:
+                    word, field = word.rsplit(" ", 1)
+                count = int(field)
+                if word in self and not overwrite:
+                    raise RuntimeError(
+                        "Duplicate word found when loading Dictionary: '{}'. "
+                        "Duplicate words can overwrite earlier ones by adding the "
+                        "#fairseq:overwrite flag at the end of the corresponding row "
+                        "in the dictionary file. If using the Camembert model, please "
+                        "download an updated copy of the model file.".format(word)
+                    )
+                self.add_symbol(word, n=count, overwrite=overwrite)
+            except ValueError:
+                raise ValueError(
+                    f"Incorrect dictionary format, expected '<token> <cnt> [flags]': \"{line}\""
+                )
+
+    def _save(self, f, kv_iterator):
+        if isinstance(f, str):
+            PathManager.mkdirs(os.path.dirname(f))
+            with PathManager.open(f, "w", encoding="utf-8") as fd:
+                return self.save(fd)
+        for k, v in kv_iterator:
+            print("{} {}".format(k, v), file=f)
+
+    def _get_meta(self):
+        return [], []
+
+    def _load_meta(self, lines):
+        return 0
+
+    def save(self, f):
+        """Stores dictionary into a text file"""
+        ex_keys, ex_vals = self._get_meta()
+        self._save(
+            f,
+            zip(
+                ex_keys + self.symbols[self.nspecial :],
+                ex_vals + self.count[self.nspecial :],
+            ),
+        )
+
+    def dummy_sentence(self, length):
+        t = torch.Tensor(length).uniform_(self.nspecial + 1, len(self)).long()
+        t[-1] = self.eos()
+        return t
+
+    def encode_line(
+        self,
+        line,
+        line_tokenizer=tokenize_line,
+        add_if_not_exist=True,
+        consumer=None,
+        append_eos=True,
+        reverse_order=False,
+    ) -> torch.IntTensor:
+        words = line_tokenizer(line)
+        if reverse_order:
+            words = list(reversed(words))
+        nwords = len(words)
+        ids = torch.IntTensor(nwords + 1 if append_eos else nwords)
+
+        for i, word in enumerate(words):
+            if add_if_not_exist:
+                idx = self.add_symbol(word)
+            else:
+                idx = self.index(word)
+            if consumer is not None:
+                consumer(word, idx)
+            ids[i] = idx
+        if append_eos:
+            ids[nwords] = self.eos_index
+        return ids
+
+    @staticmethod
+    def _add_file_to_dictionary_single_worker(
+        filename,
+        tokenize,
+        eos_word,
+        start_offset,
+        end_offset,
+    ):
+        """Count tokens of one file chunk, plus one *eos_word* per line."""
+        tally = Counter()
+        with Chunker(filename, start_offset, end_offset) as chunk_lines:
+            for text in chunk_lines:
+                tally.update(tokenize(text))
+                tally[eos_word] += 1
+        return tally
+
+    @staticmethod
+    def add_file_to_dictionary(filename, dict, tokenize, num_workers):
+        """Count the tokens of *filename* and add them to *dict*.
+
+        The file is split at the offsets returned by ``find_offsets``. With
+        *num_workers* > 1 the chunks are counted by a process pool and merged
+        in chunk order; otherwise the first chunk is counted in-process. A
+        failure in any worker propagates before *dict* is modified.
+        """
+        count_chunk = Dictionary._add_file_to_dictionary_single_worker
+
+        def merge_result(counter):
+            # Add tokens in sorted order.
+            for w, c in sorted(counter.items()):
+                dict.add_symbol(w, c)
+
+        local_file = PathManager.get_local_path(filename)
+        offsets = find_offsets(local_file, num_workers)
+        if num_workers > 1:
+            # The context manager shuts the pool down even when a worker
+            # raises, so all results are fetched before leaving the block.
+            with Pool(processes=num_workers) as pool:
+                pending = [
+                    pool.apply_async(
+                        count_chunk, (local_file, tokenize, dict.eos_word, start, end)
+                    )
+                    for start, end in zip(offsets, offsets[1:])
+                ]
+                counters = [r.get() for r in pending]
+            for counter in counters:
+                merge_result(counter)
+        else:
+            merge_result(
+                count_chunk(local_file, tokenize, dict.eos_word, offsets[0], offsets[1])
+            )
+
+
+class TruncatedDictionary(object):
+    """Wrap *wrapped_dict*, exposing only its first *length* symbols."""
+
+    def __init__(self, wrapped_dict, length):
+        bases = (self.__class__, wrapped_dict.__class__)  # also act as wrapped type
+        self.__class__ = type(wrapped_dict.__class__.__name__, bases, {})
+        self.__dict__ = wrapped_dict.__dict__  # shared with the wrapped dict
+        self.wrapped_dict = wrapped_dict
+        self.length = min(len(self.wrapped_dict), length)
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, i):
+        if i < self.length:
+            return self.wrapped_dict[i]
+        # Return the unk *symbol* like Dictionary.__getitem__, not its index.
+        return self.wrapped_dict[self.wrapped_dict.unk()]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
new file mode 100644
index 0000000000..7cbe00a105
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import importlib
+import os
+
+from fairseq import registry
+
+
+build_tokenizer, register_tokenizer, TOKENIZER_REGISTRY, _ = registry.setup_registry(
+    "--tokenizer",
+    default=None,
+)
+
+
+build_bpe, register_bpe, BPE_REGISTRY, _ = registry.setup_registry(
+    "--bpe",
+    default=None,
+)
+
+
+# automatically import any Python files in the encoders/ directory
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        module = file[: file.find(".py")]
+        importlib.import_module("fairseq.data.encoders." + module)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
new file mode 100644
index 0000000000..31e3a06278
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass, field
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.data.encoders.byte_utils import (
+    SPACE,
+    SPACE_ESCAPE,
+    byte_encode,
+    smart_byte_decode,
+)
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class ByteBpeConfig(FairseqDataclass):
+    sentencepiece_model_path: str = field(
+        default="???", metadata={"help": "path to sentencepiece model"}
+    )
+
+
+@register_bpe("byte_bpe", dataclass=ByteBpeConfig)
+class ByteBPE(object):
+    def __init__(self, cfg):
+        vocab = file_utils.cached_path(cfg.sentencepiece_model_path)
+        try:
+            import sentencepiece as spm
+
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.Load(vocab)
+        except ImportError:
+            raise ImportError(
+                "Please install sentencepiece with: pip install sentencepiece"
+            )
+
+    def encode(self, x: str) -> str:
+        byte_encoded = byte_encode(x)
+        return SPACE.join(self.sp.EncodeAsPieces(byte_encoded))
+
+    @staticmethod
+    def decode(x: str) -> str:
+        unescaped = x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE)
+        return smart_byte_decode(unescaped)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
new file mode 100644
index 0000000000..a305c08092
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+
+WHITESPACE_NORMALIZER = re.compile(r"\s+")
+SPACE = chr(32)
+SPACE_ESCAPE = chr(9601)
+# excluding non-breaking space (160) and soft hyphen (173) here
+PRINTABLE_LATIN = {*range(32, 126 + 1), *range(161, 172 + 1), *range(174, 255 + 1)}
+BYTE_TO_BCHAR = {
+    b: chr(b) if b in PRINTABLE_LATIN else chr(256 + b) for b in range(256)
+}
+BCHAR_TO_BYTE = {bc: b for b, bc in BYTE_TO_BCHAR.items()}
+
+
+def byte_encode(x: str) -> str:
+    """Collapse whitespace runs to one space and map each UTF-8 byte to a char."""
+    normalized = WHITESPACE_NORMALIZER.sub(SPACE, x)
+    return "".join([BYTE_TO_BCHAR[b] for b in normalized.encode("utf-8")])
+
+
+def byte_decode(x: str) -> str:
+    """Map chars back to bytes and decode as UTF-8; "" if that is impossible."""
+    try:
+        return bytes([BCHAR_TO_BYTE[bc] for bc in x]).decode("utf-8")
+    except (KeyError, ValueError):  # char outside the table / invalid UTF-8
+        return ""
+
+
+def smart_byte_decode(x: str) -> str:
+    """Decode *x*; if that fails, keep as many decodable characters as possible."""
+    output = byte_decode(x)
+    if output == "":
+        # DP over prefixes: f[i] = most chars recoverable from x[:i], pt[i] = link
+        n_bytes = len(x)
+        f, pt = [0] * (n_bytes + 1), [0] * (n_bytes + 1)
+        for i in range(1, n_bytes + 1):
+            f[i], pt[i] = f[i - 1], i - 1
+            for j in range(1, min(4, i) + 1):
+                if f[i - j] + 1 > f[i] and len(byte_decode(x[i - j : i])) > 0:
+                    f[i], pt[i] = f[i - j] + 1, i - j
+        cur_pt = n_bytes
+        while cur_pt > 0:
+            if f[cur_pt] == f[pt[cur_pt]] + 1:
+                output = byte_decode(x[pt[cur_pt] : cur_pt]) + output
+            cur_pt = pt[cur_pt]
+    return output
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
new file mode 100644
index 0000000000..f88f8f6929
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
@@ -0,0 +1,34 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.data.encoders import register_bpe
+from fairseq.data.encoders.byte_utils import (
+    SPACE,
+    SPACE_ESCAPE,
+    byte_encode,
+    smart_byte_decode,
+)
+
+
+@register_bpe("bytes")
+class Bytes(object):
+    def __init__(self, *unused):
+        pass
+
+    @staticmethod
+    def add_args(parser):
+        pass
+
+    @staticmethod
+    def encode(x: str) -> str:
+        encoded = byte_encode(x)
+        escaped = encoded.replace(SPACE, SPACE_ESCAPE)
+        return SPACE.join(list(escaped))
+
+    @staticmethod
+    def decode(x: str) -> str:
+        unescaped = x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE)
+        return smart_byte_decode(unescaped)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
new file mode 100644
index 0000000000..494ea21939
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.data.encoders import register_bpe
+
+
+SPACE = chr(32)
+SPACE_ESCAPE = chr(9601)
+
+
+@register_bpe("characters")
+class Characters(object):
+    def __init__(self, *unused):
+        pass
+
+    @staticmethod
+    def add_args(parser):
+        pass
+
+    @staticmethod
+    def encode(x: str) -> str:
+        escaped = x.replace(SPACE, SPACE_ESCAPE)
+        return SPACE.join(list(escaped))
+
+    @staticmethod
+    def decode(x: str) -> str:
+        return x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
new file mode 100644
index 0000000000..f7c2103954
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class fastBPEConfig(FairseqDataclass):
+    bpe_codes: str = field(default="???", metadata={"help": "path to fastBPE BPE"})
+
+
+@register_bpe("fastbpe", dataclass=fastBPEConfig)
+class fastBPE(object):
+    def __init__(self, cfg):
+        if cfg.bpe_codes is None:
+            raise ValueError("--bpe-codes is required for --bpe=fastbpe")
+        codes = file_utils.cached_path(cfg.bpe_codes)
+        try:
+            import fastBPE
+
+            self.bpe = fastBPE.fastBPE(codes)
+            self.bpe_symbol = "@@ "
+        except ImportError:
+            raise ImportError("Please install fastBPE with: pip install fastBPE")
+
+    def encode(self, x: str) -> str:
+        return self.bpe.apply([x])[0]
+
+    def decode(self, x: str) -> str:
+        return (x + " ").replace(self.bpe_symbol, "").rstrip()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
new file mode 100644
index 0000000000..e661426a73
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
@@ -0,0 +1,45 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+
+from .gpt2_bpe_utils import get_encoder
+
+
+DEFAULT_ENCODER_JSON = "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json"
+DEFAULT_VOCAB_BPE = "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe"
+
+
+@dataclass
+class GPT2BPEConfig(FairseqDataclass):
+    gpt2_encoder_json: str = field(
+        default=DEFAULT_ENCODER_JSON, metadata={"help": "path to encoder.json"}
+    )
+    gpt2_vocab_bpe: str = field(
+        default=DEFAULT_VOCAB_BPE, metadata={"help": "path to vocab.bpe"}
+    )
+
+
+@register_bpe("gpt2", dataclass=GPT2BPEConfig)
+class GPT2BPE(object):
+    def __init__(self, cfg):
+        encoder_json = file_utils.cached_path(cfg.gpt2_encoder_json)
+        vocab_bpe = file_utils.cached_path(cfg.gpt2_vocab_bpe)
+        self.bpe = get_encoder(encoder_json, vocab_bpe)
+
+    def encode(self, x: str) -> str:
+        return " ".join(map(str, self.bpe.encode(x)))
+
+    def decode(self, x: str) -> str:
+        return self.bpe.decode(
+            [int(tok) if tok not in {"<unk>", "<mask>"} else tok for tok in x.split()]
+        )
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return self.decode(x).startswith(" ")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
new file mode 100644
index 0000000000..996d3d4a11
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
@@ -0,0 +1,140 @@
+"""
+Byte pair encoding utilities from GPT-2.
+
+Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+Original license: MIT
+"""
+
+import json
+from functools import lru_cache
+
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Return a dict mapping every byte value (0-255) to a unicode character.
+
+    The reversible BPE codes work on unicode strings, so a lookup table
+    between utf-8 bytes and unicode characters keeps the vocabulary small
+    while avoiding UNKs. Bytes in the three printable ranges below map to
+    themselves; the remaining ones (whitespace/control characters the BPE
+    code cannot handle) are assigned, in increasing byte order, the code
+    points from 256 upwards.
+    """
+    printable = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    byte_values = list(printable)
+    code_points = list(printable)
+    # Every byte outside the printable ranges gets a fresh code point >= 256.
+    extras = [b for b in range(2**8) if b not in printable]
+    for offset, b in enumerate(extras):
+        byte_values.append(b)
+        code_points.append(2**8 + offset)
+    return dict(zip(byte_values, map(chr, code_points)))
+
+
+def get_pairs(word):
+    """Return the set of adjacent symbol pairs in *word*.
+
+    *word* is a tuple (or string) of symbols, each a variable-length string.
+    Inputs with fewer than two symbols, including an empty one, give an
+    empty set instead of raising ``IndexError``.
+
+    Example: ``get_pairs(("a", "b", "c")) == {("a", "b"), ("b", "c")}``.
+    """
+    return set(zip(word, word[1:]))
+
+
+class Encoder:
+    def __init__(self, encoder, bpe_merges, errors="replace"):
+        self.encoder = encoder
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+
+        try:
+            import regex as re
+
+            self.re = re
+        except ImportError:
+            raise ImportError("Please install regex with: pip install regex")
+
+        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = self.re.compile(
+            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+        )
+
+    def bpe(self, token):
+        """Apply the learned merges to *token*; return its symbols joined by
+        spaces (cached per token), or *token* itself if it has no symbol pair."""
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            # Merge the lowest-ranked pair; stop when no known merge is left.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:  # no further `first`: copy the tail
+                    new_word.extend(word[i:])
+                    break
+                new_word.extend(word[i:j])
+                i = j
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            word = tuple(new_word)
+            if len(word) == 1:
+                break
+            pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        for token in self.re.findall(self.pat, text):
+            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+            bpe_tokens.extend(
+                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
+            )
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = "".join([self.decoder.get(token, token) for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            "utf-8", errors=self.errors
+        )
+        return text
+
+
+def get_encoder(encoder_json_path, vocab_bpe_path):
+    """Build an Encoder from an ``encoder.json`` and a ``vocab.bpe`` file."""
+    # Read both files as UTF-8 so the result does not depend on the locale.
+    with open(encoder_json_path, "r", encoding="utf-8") as f:
+        encoder = json.load(f)
+    with open(vocab_bpe_path, "r", encoding="utf-8") as f:
+        bpe_data = f.read()
+    # [1:-1] skips the first line and the last piece of the split.
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
+    return Encoder(encoder=encoder, bpe_merges=bpe_merges)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
new file mode 100644
index 0000000000..a41c059343
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class BertBPEConfig(FairseqDataclass):
+    bpe_cased: bool = field(default=False, metadata={"help": "set for cased BPE"})
+    bpe_vocab_file: Optional[str] = field(
+        default=None, metadata={"help": "bpe vocab file"}
+    )
+
+
+@register_bpe("bert", dataclass=BertBPEConfig)
+class BertBPE(object):
+    def __init__(self, cfg):
+        try:
+            from transformers import BertTokenizer
+        except ImportError:
+            raise ImportError(
+                "Please install transformers with: pip install transformers"
+            )
+
+        if cfg.bpe_vocab_file:
+            self.bert_tokenizer = BertTokenizer(
+                cfg.bpe_vocab_file, do_lower_case=not cfg.bpe_cased
+            )
+        else:
+            vocab_file_name = (
+                "bert-base-cased" if cfg.bpe_cased else "bert-base-uncased"
+            )
+            self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
+
+    def encode(self, x: str) -> str:
+        return " ".join(self.bert_tokenizer.tokenize(x))
+
+    def decode(self, x: str) -> str:
+        return self.bert_tokenizer.clean_up_tokenization(
+            self.bert_tokenizer.convert_tokens_to_string(x.split(" "))
+        )
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return not x.startswith("##")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
new file mode 100644
index 0000000000..c508578d41
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+from fairseq import file_utils
+
+
+@dataclass
+class HuggingFaceByteLevelBPEConfig(FairseqDataclass):
+    bpe_merges: str = field(default="???", metadata={"help": "path to merges.txt"})
+    bpe_vocab: str = field(default="???", metadata={"help": "path to vocab.json"})
+    bpe_add_prefix_space: bool = field(
+        default=False, metadata={"help": "add prefix space before encoding"}
+    )
+
+
+@register_bpe("hf_byte_bpe", dataclass=HuggingFaceByteLevelBPEConfig)
+class HuggingFaceByteLevelBPE(object):
+    """Byte-level BPE that encodes text to space-separated token ids."""
+
+    def __init__(self, cfg):
+        try:
+            from tokenizers import ByteLevelBPETokenizer
+        except ImportError:
+            raise ImportError(
+                "Please install huggingface/tokenizers with: pip install tokenizers"
+            )
+
+        # both paths go through file_utils.cached_path before reaching the tokenizer
+        self.bpe = ByteLevelBPETokenizer(
+            file_utils.cached_path(cfg.bpe_vocab),
+            file_utils.cached_path(cfg.bpe_merges),
+            add_prefix_space=cfg.bpe_add_prefix_space,
+        )
+
+    def encode(self, x: str) -> str:
+        return " ".join(str(idx) for idx in self.bpe.encode(x).ids)
+
+    def decode(self, x: str) -> str:
+        # "<unk>" / "<mask>" are handed to the tokenizer as-is, not parsed as ints
+        passthrough = {"<unk>", "<mask>"}
+        return self.bpe.decode([t if t in passthrough else int(t) for t in x.split()])
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        return self.decode(x).startswith(" ")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
new file mode 100644
index 0000000000..e236dad167
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq.data.encoders import register_tokenizer
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class MosesTokenizerConfig(FairseqDataclass):
+    source_lang: str = field(default="en", metadata={"help": "source language"})
+    target_lang: str = field(default="en", metadata={"help": "target language"})
+    moses_no_dash_splits: bool = field(
+        default=False, metadata={"help": "don't apply dash split rules"}
+    )
+    moses_no_escape: bool = field(
+        default=False,
+        metadata={"help": "don't perform HTML escaping on apostrophe, quotes, etc."},
+    )
+
+
+@register_tokenizer("moses", dataclass=MosesTokenizerConfig)
+class MosesTokenizer(object):
+    """Tokenizer/detokenizer pair from ``sacremoses`` driven by the config."""
+
+    def __init__(self, cfg: MosesTokenizerConfig):
+        self.cfg = cfg
+        try:
+            from sacremoses import MosesDetokenizer, MosesTokenizer as _SacreTok
+
+            self.tok = _SacreTok(cfg.source_lang)
+            self.detok = MosesDetokenizer(cfg.target_lang)
+        except ImportError:
+            raise ImportError(
+                "Please install Moses tokenizer with: pip install sacremoses"
+            )
+
+    def encode(self, x: str) -> str:
+        split_dashes = not self.cfg.moses_no_dash_splits
+        escape = not self.cfg.moses_no_escape
+        return self.tok.tokenize(
+            x, aggressive_dash_splits=split_dashes, return_str=True, escape=escape
+        )
+
+    def decode(self, x: str) -> str:
+        return self.detok.detokenize(x.split())
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
new file mode 100644
index 0000000000..0ab92377b3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data.encoders import register_tokenizer
+from fairseq.dataclass import FairseqDataclass
+
+
+@register_tokenizer("nltk", dataclass=FairseqDataclass)
+class NLTKTokenizer(object):
+    def __init__(self, *unused):
+        try:
+            from nltk.tokenize import word_tokenize
+        except ImportError:
+            raise ImportError("Please install nltk with: pip install nltk")
+        self.word_tokenize = word_tokenize
+
+    def encode(self, x: str) -> str:
+        return " ".join(self.word_tokenize(x))
+
+    def decode(self, x: str) -> str:
+        # detokenization is not implemented: the input is returned untouched
+        return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
new file mode 100644
index 0000000000..0aa6cd7681
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
@@ -0,0 +1,65 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class SentencepieceConfig(FairseqDataclass):
+    sentencepiece_model: str = field(
+        default="???", metadata={"help": "path to sentencepiece model"}
+    )
+    sentencepiece_enable_sampling: bool = field(
+        default=False, metadata={"help": "enable sampling"}
+    )
+    sentencepiece_alpha: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "soothing parameter for unigram sampling, "
+            "and merge probability for BPE-dropout"
+        },
+    )
+
+
+@register_bpe("sentencepiece", dataclass=SentencepieceConfig)
+class SentencepieceBPE(object):
+    """SentencePiece encoder emitting space-joined pieces (optionally sampled)."""
+
+    def __init__(self, cfg):
+        self.enable_sampling = cfg.sentencepiece_enable_sampling
+        self.alpha = cfg.sentencepiece_alpha
+        sentencepiece_model = file_utils.cached_path(cfg.sentencepiece_model)
+        try:
+            import sentencepiece as spm
+
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.Load(sentencepiece_model)
+        except ImportError:
+            raise ImportError(
+                "Please install sentencepiece with: pip install sentencepiece"
+            )
+
+    def encode(self, x: str) -> str:
+        """Split *x* into string pieces and join them with single spaces."""
+        sampling = {"enable_sampling": self.enable_sampling, "alpha": self.alpha}
+        return " ".join(self.sp.Encode(x, out_type=str, **sampling))
+
+    def decode(self, x: str) -> str:
+        # drop the spaces between pieces, then turn U+2581 markers into spaces
+        return x.replace(" ", "").replace("\u2581", " ").strip()
+
+    def is_beginning_of_word(self, x: str) -> bool:
+        """True for the listed special tokens and for pieces starting with U+2581."""
+        # special elements are always considered beginnings
+        # HACK: this logic is already present in fairseq/tasks/masked_lm.py
+        # but these special tokens are also contained in the sentencepiece
+        # vocabulary which causes duplicate special tokens. This hack makes
+        # sure that they are all taken into account.
+        return x in ("<unk>", "<s>", "</s>", "<pad>") or x.startswith("\u2581")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
new file mode 100644
index 0000000000..925ad41b7c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+from fairseq.data.encoders import register_tokenizer
+from fairseq.dataclass import FairseqDataclass
+
+
+@register_tokenizer("space", dataclass=FairseqDataclass)
+class SpaceTokenizer(object):
+    def __init__(self, *unused):
+        self.space_tok = re.compile(r"\s+")  # matches any run of whitespace
+
+    def encode(self, x: str) -> str:
+        return self.space_tok.sub(" ", x)  # each whitespace run becomes one space
+
+    def decode(self, x: str) -> str:
+        return x  # nothing is undone; the text is returned as given
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
new file mode 100644
index 0000000000..5d724d2730
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq import file_utils
+from fairseq.data.encoders import register_bpe
+from fairseq.dataclass import FairseqDataclass
+
+
+@dataclass
+class SubwordNMTBPEConfig(FairseqDataclass):
+    bpe_codes: str = field(default="???", metadata={"help": "path to subword NMT BPE"})
+    bpe_separator: str = field(default="@@", metadata={"help": "BPE separator"})
+
+
+@register_bpe("subword_nmt", dataclass=SubwordNMTBPEConfig)
+class SubwordNMTBPE(object):
+    """BPE encoder/decoder delegating to the ``subword_nmt`` package."""
+
+    def __init__(self, cfg):
+        if cfg.bpe_codes is None:
+            raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
+        codes = file_utils.cached_path(cfg.bpe_codes)
+        try:
+            from subword_nmt import apply_bpe
+
+            # subword_nmt's own CLI parser supplies the merges/glossaries values
+            cli = ["--codes", codes, "--separator", cfg.bpe_separator]
+            bpe_args = apply_bpe.create_parser().parse_args(cli)
+            self.bpe = apply_bpe.BPE(
+                bpe_args.codes,
+                bpe_args.merges,
+                bpe_args.separator,
+                None,
+                bpe_args.glossaries,
+            )
+            self.bpe_symbol = bpe_args.separator + " "
+        except ImportError:
+            raise ImportError(
+                "Please install subword_nmt with: pip install subword-nmt"
+            )
+
+    def encode(self, x: str) -> str:
+        """Apply the loaded BPE codes to one line of text."""
+        return self.bpe.process_line(x)
+
+    def decode(self, x: str) -> str:
+        """Undo BPE by deleting every ``<separator> `` join marker."""
+        # the temporary trailing space lets a marker at the very end match too
+        joined = (x + " ").replace(self.bpe_symbol, "")
+        return joined.rstrip()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
new file mode 100644
index 0000000000..d93eb532ef
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.data import encoders
+
+
+def get_whole_word_mask(args, dictionary):
+    """Return a ByteTensor flagging word-initial dictionary entries, or None."""
+    bpe = encoders.build_bpe(args)
+    if bpe is None:
+        return None
+
+    def starts_word(idx):
+        # special elements are always considered beginnings
+        if idx < dictionary.nspecial:
+            return True
+        token = dictionary[idx]
+        if token.startswith("madeupword"):
+            return True
+        try:
+            return bpe.is_beginning_of_word(token)
+        except ValueError:
+            # a ValueError from the BPE is treated as a beginning as well
+            return True
+
+    flags = [starts_word(idx) for idx in range(len(dictionary))]
+    return torch.ByteTensor(flags)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
new file mode 100644
index 0000000000..2bde7fc57b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
@@ -0,0 +1,205 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import numpy as np
+import torch.utils.data
+from fairseq.data import data_utils
+
+logger = logging.getLogger(__name__)
+
+
+class EpochListening:
+    """Mixin for receiving updates whenever the epoch increments."""
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        """
+        Whether we can reuse the :class:`fairseq.data.EpochBatchIterator` for
+        this dataset across epochs.
+
+        This needs to return ``False`` if the sample sizes can change across
+        epochs, in which case we may need to regenerate batches at each epoch.
+        If your dataset relies on ``set_epoch`` then you should consider setting
+        this to ``False``.
+        """
+        return True
+
+    def set_epoch(self, epoch):
+        """Receive the updated epoch number at the beginning of the epoch (no-op here)."""
+        pass
+
+
+class FairseqDataset(torch.utils.data.Dataset, EpochListening):
+    """A dataset that provides helpers for batching."""
+
+    def __getitem__(self, index):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        raise NotImplementedError
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        raise NotImplementedError
+
+    def num_tokens_vec(self, indices):
+        """Return the number of tokens for a set of positions defined by indices.
+        This value is used to enforce ``--max-tokens`` during batching."""
+        raise NotImplementedError
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        raise NotImplementedError
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        return np.arange(len(self), dtype=np.int64)
+
+    @property
+    def supports_prefetch(self):
+        """Whether this dataset supports prefetching."""
+        return False
+
+    def attr(self, attr: str, index: int):
+        """Look up *attr* on the dataset itself; *index* is ignored here."""
+        return getattr(self, attr, None)
+
+    def prefetch(self, indices):
+        """Prefetch the data required for this epoch."""
+        raise NotImplementedError
+
+    def get_batch_shapes(self):
+        """
+        Return a list of valid batch shapes, for example::
+
+            [(8, 512), (16, 256), (32, 128)]
+
+        The first dimension of each tuple is the batch size and can be ``None``
+        to automatically infer the max batch size based on ``--max-tokens``.
+        The second dimension of each tuple is the max supported length as given
+        by :func:`fairseq.data.FairseqDataset.num_tokens`.
+
+        This will be used by :func:`fairseq.data.FairseqDataset.batch_by_size`
+        to restrict batch shapes. This is useful on TPUs to avoid too many
+        dynamic shapes (and recompilations).
+        """
+        return None
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        """
+        Given an ordered set of indices, return batches according to
+        *max_tokens*, *max_sentences* and *required_batch_size_multiple*.
+
+        Shapes from :func:`get_batch_shapes`, if any, are normalised first and
+        forwarded as *fixed_shapes*.
+        """
+        multiple = required_batch_size_multiple
+
+        def adjust_bsz(bsz, num_tokens):
+            # a missing batch size is derived from the token budget
+            if bsz is None:
+                assert max_tokens is not None, "Must specify --max-tokens"
+                bsz = max_tokens // num_tokens
+            if max_sentences is not None:
+                return min(bsz, max_sentences)
+            if bsz >= multiple and bsz % multiple != 0:
+                # round down to the nearest multiple
+                bsz -= bsz % multiple
+            return bsz
+
+        fixed_shapes = self.get_batch_shapes()
+        if fixed_shapes is not None:
+            fixed_shapes = np.array(
+                [
+                    [adjust_bsz(bsz, num_tokens), num_tokens]
+                    for (bsz, num_tokens) in fixed_shapes
+                ]
+            )
+
+        try:
+            num_tokens_vec = self.num_tokens_vec(indices).astype("int64")
+        except NotImplementedError:
+            # the vectorised variant is optional; None is passed on in that case
+            num_tokens_vec = None
+
+        return data_utils.batch_by_size(
+            indices,
+            num_tokens_fn=self.num_tokens,
+            num_tokens_vec=num_tokens_vec,
+            max_tokens=max_tokens,
+            max_sentences=max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+            fixed_shapes=fixed_shapes,
+        )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """
+        Filter a list of sample indices. Remove those that are longer than
+        specified in *max_sizes*.
+
+        WARNING: don't update, override method in child classes
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        sizes, vectorised = None, False
+        if isinstance(max_sizes, (float, int)) and hasattr(self, "sizes"):
+            if isinstance(self.sizes, np.ndarray):
+                sizes, vectorised = self.sizes, True
+            elif isinstance(self.sizes, list) and len(self.sizes) == 1:
+                sizes, vectorised = self.sizes[0], True
+
+        if vectorised:
+            picked = sizes[indices]
+            ignored = indices[picked > max_sizes].tolist()
+            indices = indices[picked <= max_sizes]
+        else:
+            # non-scalar limit or no usable size array: use the size()-based filter
+            indices, ignored = data_utils._filter_by_size_dynamic(
+                indices, self.size, max_sizes
+            )
+        return indices, ignored
+
+    @property
+    def supports_fetch_outside_dataloader(self):
+        """Whether this dataset supports fetching outside the workers of the dataloader."""
+        return True
+
+
+class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening):
+    """
+    For datasets that need to be read sequentially, usually because the data is
+    being streamed or otherwise can't be manipulated on a single machine.
+    """
+
+    def __iter__(self):
+        raise NotImplementedError  # no default iteration is provided here
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
new file mode 100644
index 0000000000..007011974a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import subprocess
+import threading
+from pathlib import Path
+
+import numpy as np
+import torch
+
+
+def fasta_file_path(prefix_path):
+    return prefix_path + ".fasta"  # plain "+": the prefix must concatenate with str
+
+
+class FastaDataset(torch.utils.data.Dataset):
+    """
+    For loading protein sequence datasets in the common FASTA data format
+    """
+
+    def __init__(self, path: str, cache_indices=False):
+        self.fn = fasta_file_path(path)
+        self.threadlocal = threading.local()
+        self.cache = Path(f"{path}.fasta.idx.npy")
+        if cache_indices and self.cache.exists():
+            self.offsets, self.sizes = np.load(self.cache)
+            return
+        self.offsets, self.sizes = self._build_index(path)
+        if cache_indices:
+            # persist the index so that it can be loaded instead of rebuilt
+            np.save(self.cache, np.stack([self.offsets, self.sizes]))
+
+    def _get_file(self):
+        """Return this thread's handle on the FASTA file, opening it on first use."""
+        if not hasattr(self.threadlocal, "f"):
+            self.threadlocal.f = open(self.fn, "r")
+        return self.threadlocal.f
+
+    def __getitem__(self, idx):
+        """Return ``(description_line, sequence)`` for record *idx*."""
+        f = self._get_file()
+        f.seek(self.offsets[idx])
+        desc = f.readline().strip()
+        chunks = []
+        # sequence lines run until the next ">" header or the end of the file
+        for line in iter(f.readline, ""):
+            if line[0] == ">":
+                break
+            chunks.append(line.strip())
+        return desc, "".join(chunks)
+
+    def __len__(self):
+        return self.offsets.size
+
+    def _build_index(self, path: str):
+        # Use grep and awk to get 100M/s on local SSD.
+        # Should process your enormous 100G fasta in ~10 min single core...
+        import shlex
+        # quote the path: it is spliced into the shell command lines below
+        path = shlex.quote(fasta_file_path(path))
+        bytes_offsets = subprocess.check_output(
+            f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
+            "| grep --byte-offset '^>' -o | cut -d: -f1",
+            shell=True,
+        )
+        fasta_lengths = subprocess.check_output(
+            f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
+            "| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}' | tail -n+2 | awk '{print length($1)}'",
+            shell=True,
+        )
+        offsets = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ")
+        return offsets, np.fromstring(fasta_lengths, dtype=np.int64, sep=" ")
+
+    def __setstate__(self, state):
+        self.__dict__ = state
+        self.threadlocal = threading.local()
+
+    def __getstate__(self):
+        # threading.local cannot be pickled; __setstate__ creates a fresh one
+        return {k: v for k, v in self.__dict__.items() if k != "threadlocal"}
+
+    def __del__(self):
+        if hasattr(self.threadlocal, "f"):
+            self.threadlocal.f.close()
+            del self.threadlocal.f
+
+    @staticmethod
+    def exists(path):
+        return os.path.exists(fasta_file_path(path))
+
+
+class EncodedFastaDataset(FastaDataset):
+    """
+    Variant of FastaDataset that returns each sequence encoded with a
+    dictionary (split with ``list``) instead of the raw sequence string.
+    """
+
+    def __init__(self, path, dictionary):
+        super().__init__(path, cache_indices=True)
+        self.dictionary = dictionary
+
+    def __getitem__(self, idx):
+        _, sequence = super().__getitem__(idx)  # the description line is dropped
+        return self.dictionary.encode_line(sequence, line_tokenizer=list).long()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
new file mode 100644
index 0000000000..9b61fafadb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .huffman_coder import HuffmanCodeBuilder, HuffmanCoder
+from .huffman_mmap_indexed_dataset import (
+    HuffmanMMapIndex,
+    HuffmanMMapIndexedDataset,
+    HuffmanMMapIndexedDatasetBuilder,
+    vocab_file_path,
+)
+
+__all__ = [
+    "HuffmanCoder",
+    "HuffmanCodeBuilder",
+    "HuffmanMMapIndexedDatasetBuilder",
+    "HuffmanMMapIndexedDataset",
+    "HuffmanMMapIndex",
+    "vocab_file_path",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
new file mode 100644
index 0000000000..c04f84564e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
@@ -0,0 +1,267 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+import typing as tp
+from collections import Counter, deque
+from dataclasses import dataclass
+
+from bitarray import bitarray, util
+from fairseq.data import Dictionary
+
+# basically we have to write to addressable bytes for the memory mapped
+# dataset loader. Sentences that get encoded to a length that is not a
+# multiple of BLOCKSIZE (a byte) will be padded to fit. (see _pad in the coder)
+BLOCKSIZE = 8
+
+
+class HuffmanCoder:
+    """Encodes token lists to padded Huffman bit strings and decodes them back."""
+
+    def __init__(
+        self, root: "HuffmanNode", bos="<s>", pad="<pad>", eos="</s>", unk="<unk>"
+    ):
+        self.root = root
+        self.table = root.code_table()
+        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
+
+    def _pad(self, a: bitarray) -> bitarray:
+        """
+        bitpadding, 1 then 0.
+
+        If the array is already a multiple of blocksize, we add a full block.
+        """
+        pad_len = BLOCKSIZE - (len(a) % BLOCKSIZE) - 1
+        padding = bitarray("1" + "0" * pad_len)
+        return a + padding
+
+    def _unpad(self, a: bitarray) -> bitarray:
+        """
+        remove the bitpadding.
+
+        There will be a set of 0s preceded by a 1 at the end of the bitarray, we remove that
+        """
+        # the padding starts at the last 1 bit; cut there so that the 1 and the
+        # 0s after it are all dropped
+        return a[: util.rindex(a, 1)]
+
+    def encode(self, iter: tp.List[str]) -> bytes:
+        """
+        encode a list of tokens and return bytes. We use bitpadding to make sure the encoded bits fit in bytes.
+        Tokens without a code fall back to the unknown word when that has a code; otherwise we raise.
+        """
+        a = bitarray()
+        for token in iter:
+            code = self.get_code(token)
+            if code is None and self.unk_word is not None:
+                code = self.get_code(self.unk_word)
+            if code is None:
+                raise Exception(f"unknown token {token} cannot be encoded.")
+            a.extend(code)
+        return self._pad(a).tobytes()
+
+    def decode(self, bits: bytes) -> tp.Iterator["HuffmanNode"]:
+        """
+        take bitpadded bytes and decode it to a set of leaves. You can then use each node to find the symbol/id
+        """
+        a = bitarray()
+        a.frombytes(bits)
+        return self.root.decode(self._unpad(a))
+
+    def get_code(self, symbol: str) -> tp.Optional[bitarray]:
+        node = self.get_node(symbol)
+        return None if node is None else node.code
+
+    def get_node(self, symbol: str) -> tp.Optional["HuffmanNode"]:
+        return self.table.get(symbol)
+
+    @classmethod
+    def from_file(
+        cls,
+        filename: str,
+        bos="<s>",
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+    ) -> "HuffmanCoder":
+        builder = HuffmanCodeBuilder.from_file(filename)
+        return builder.build_code(bos=bos, pad=pad, eos=eos, unk=unk)
+
+    def to_file(self, filename, sep="\t"):
+        nodes = sorted(self.table.values(), key=lambda n: n.id)
+        with open(filename, "w", encoding="utf-8") as output:
+            for n in nodes:
+                output.write(f"{n.symbol}{sep}{n.count}\n")
+
+    def __iter__(self):
+        yield from self.table.values()
+
+    def merge(self, other_coder: "HuffmanCoder") -> "HuffmanCoder":
+        builder = HuffmanCodeBuilder()
+        for n in [*self, *other_coder]:
+            builder.increment(n.symbol, n.count)
+        return builder.build_code()
+
+    def __eq__(self, other: "HuffmanCoder") -> bool:
+        return self.table == other.table
+
+    def __len__(self) -> int:
+        return len(self.table)
+
+    def __contains__(self, sym: str) -> bool:
+        return sym in self.table
+
+    def to_dictionary(self) -> Dictionary:
+        # the special words are stored as *_word attributes by __init__
+        dictionary = Dictionary(
+            bos=self.bos_word, unk=self.unk_word, pad=self.pad_word, eos=self.eos_word
+        )
+        for n in self:
+            dictionary.add_symbol(n.symbol, n=n.count)
+        dictionary.finalize()
+        return dictionary
+
+
+@dataclass
+class HuffmanNode:
+    """a node in a Huffman tree"""
+
+    id: int
+    count: int
+    symbol: tp.Optional[str] = None
+    left: tp.Optional["HuffmanNode"] = None
+    right: tp.Optional["HuffmanNode"] = None
+    # assigned by code_table(); stays None on internal nodes
+    code: tp.Optional[bitarray] = None
+
+    def is_leaf(self) -> bool:
+        return self.left is None and self.right is None
+
+    def code_table(
+        self, prefix: tp.Optional[bitarray] = None
+    ) -> tp.Dict[str, "HuffmanNode"]:
+        """Assign a code to every leaf below this node and map symbol -> leaf."""
+        prefix = bitarray() if prefix is None else prefix
+        if self.is_leaf():
+            # leaf could be the root if there is only one symbol
+            self.code = prefix if len(prefix) > 0 else bitarray("0")
+            return {self.symbol: self}
+
+        codes_right = self.right.code_table(prefix + bitarray([0]))
+        codes_left = self.left.code_table(prefix + bitarray([1]))
+        return {**codes_left, **codes_right}
+
+    def decode(self, bits: bitarray) -> tp.Iterator["HuffmanNode"]:
+        """Yield the leaf reached by each code in *bits* (0 = right, 1 = left)."""
+        current_node = self
+        for bit in bits:
+            if self.is_leaf():
+                # one-symbol tree: code_table gave the root itself the code "0"
+                current_node = self if bit == 0 else None
+            else:
+                current_node = current_node.left if bit else current_node.right
+            if current_node is None:
+                raise Exception("fell off a leaf")
+            if current_node.is_leaf():
+                yield current_node
+                current_node = self
+        if current_node is not self:
+            raise Exception("couldn't decode all the bits")
+
+
+class HuffmanCodeBuilder:
+    """
+    build a dictionary with occurrence count and then build the Huffman code for it.
+    """
+
+    def __init__(self):
+        self.symbols = Counter()
+
+    def add_symbols(self, *syms) -> None:
+        self.symbols.update(syms)
+
+    def increment(self, symbol: str, cnt: int) -> None:
+        self.symbols[symbol] += cnt
+
+    @classmethod
+    def from_file(cls, filename):
+        """Read ``symbol<whitespace>count`` lines as written by :meth:`to_file`."""
+        c = cls()
+        with open(filename, "r", encoding="utf-8") as input:
+            for line in input:
+                if not line.strip():  # tolerate blank lines, e.g. a trailing one
+                    continue
+                split = re.split(r"[\s]+", line)
+                c.increment(split[0], int(split[1]))
+        return c
+
+    def to_file(self, filename, sep="\t"):
+        with open(filename, "w", encoding="utf-8") as output:
+            for (tok, cnt) in self.symbols.most_common():
+                output.write(f"{tok}{sep}{cnt}\n")
+
+    def _smallest(self, q1: deque, q2: deque) -> HuffmanNode:
+        """Pop the lower-count tail of the two queues (ties go to *q2*)."""
+        if len(q1) == 0:
+            return q2.pop()
+        if len(q2) == 0 or q1[-1].count < q2[-1].count:
+            return q1.pop()
+        return q2.pop()
+
+    def __add__(self, c: "HuffmanCodeBuilder") -> "HuffmanCodeBuilder":
+        new_c = self.symbols + c.symbols
+        new_b = HuffmanCodeBuilder()
+        new_b.symbols = new_c
+        return new_b
+
+    def build_code(
+        self,
+        bos="<s>",
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+    ) -> HuffmanCoder:
+        """
+        Build the Huffman tree for the counted symbols and wrap it in a coder.
+
+        Special words that were never counted are first added with a count of 1.
+        """
+        assert len(self.symbols) > 0, "cannot build code from empty list of symbols"
+
+        for special in (bos, pad, eos, unk):
+            if self.symbols[special] == 0:
+                self.add_symbols(special)
+
+        node_id = 0
+        leaves_queue = deque(
+            [
+                HuffmanNode(symbol=symbol, count=count, id=idx)
+                for idx, (symbol, count) in enumerate(self.symbols.most_common())
+            ]
+        )  # left are the most common, right are the least common
+
+        if len(leaves_queue) == 1:
+            root = leaves_queue.pop()
+            root.id = 0
+            # keep the caller's special words on this path as well
+            return HuffmanCoder(root, bos=bos, pad=pad, eos=eos, unk=unk)
+
+        nodes_queue = deque()
+        while len(leaves_queue) > 0 or len(nodes_queue) != 1:
+            # get the lowest two nodes at the head of each queue
+            node1 = self._smallest(leaves_queue, nodes_queue)
+            node2 = self._smallest(leaves_queue, nodes_queue)
+
+            # add new node
+            nodes_queue.appendleft(
+                HuffmanNode(
+                    count=node1.count + node2.count, left=node1, right=node2, id=node_id
+                )
+            )
+            node_id += 1
+
+        # we are left with the root
+        return HuffmanCoder(nodes_queue.pop(), bos=bos, pad=pad, eos=eos, unk=unk)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
new file mode 100644
index 0000000000..9b098f2c2b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
@@ -0,0 +1,287 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import mmap
+import os
+import shutil
+import struct
+import typing as tp
+from functools import lru_cache
+
+import numpy as np
+import torch
+from fairseq.data import indexed_dataset
+from fairseq.data.huffman import HuffmanCoder
+from fairseq.file_io import PathManager
+
+
+class HuffmanMMapIndex:
+    """
+    keep an index of the offsets in the huffman binary file.
+    First a header, then the list of sizes (num tokens) for each instance and finally
+    the addresses of each instance.
+    """
+
+    _HDR_MAGIC = b"HUFFIDX\x00\x00"  # 9-byte magic written at the start of the index file
+    _VERSION = 1  # format version, stored right after the magic as "<Q"
+
+    @classmethod
+    def writer(cls, path: str, data_len: int):
+        class _Writer:  # context manager that creates and fills the index file at `path`
+            def __enter__(self):
+                self._file = open(path, "wb")
+
+                # write header (magic + version), then the byte length of the data file
+                self._file.write(cls._HDR_MAGIC)
+                self._file.write(struct.pack("<Q", cls._VERSION))
+                self._file.write(struct.pack("<Q", data_len))
+
+                return self
+
+            def write(self, sizes, pointers):
+                # add number of items in the index to the header
+                self._file.write(struct.pack("<Q", len(sizes)))
+
+                # write sizes (stored as int32, one per item)
+                sizes = np.array(sizes, dtype=np.int32)
+                self._file.write(sizes.tobytes(order="C"))
+                del sizes
+
+                # write address pointers (stored as int64, one per item)
+                pointers = np.array(pointers, dtype=np.int64)
+                self._file.write(pointers.tobytes(order="C"))
+                del pointers
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                self._file.close()  # runs whether or not the body raised
+
+        return _Writer()
+
+    def __init__(self, path):
+        with open(path, "rb") as stream:
+            # read headers
+            magic_test = stream.read(9)  # _HDR_MAGIC is 9 bytes long
+            assert self._HDR_MAGIC == magic_test, (
+                "Index file doesn't match expected format. "
+                "Make sure that --dataset-impl is configured properly."
+            )
+            (version,) = struct.unpack("<Q", stream.read(8))
+            assert (
+                self._VERSION == version
+            ), f"Unexpected file version{version} != code version {self._VERSION}"
+
+            # read length of data file
+            (self._data_len,) = struct.unpack("<Q", stream.read(8))
+            # read number of items in data file/index
+            (self._len,) = struct.unpack("<Q", stream.read(8))
+            offset = stream.tell()  # byte position where the sizes array begins
+
+        indexed_dataset._warmup_mmap_file(path)  # reads the file once before it is mapped
+
+        self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+        self._sizes = np.frombuffer(  # int32 array backed by the mapped file
+            self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
+        )
+        self._pointers = np.frombuffer(  # int64 array backed by the mapped file
+            self._bin_buffer,
+            dtype=np.int64,
+            count=self._len,
+            offset=offset + self._sizes.nbytes,  # pointers are stored right after the sizes
+        )
+
+    def __del__(self):
+        self._bin_buffer_mmap._mmap.close()  # NOTE(review): raises if __init__ failed before the memmap was created
+        del self._bin_buffer_mmap
+
+    def __iter__(self):
+        for i in range(self._len):
+            yield self[i]  # each element is a (pointer, size) pair
+
+    @property
+    def data_len(self):
+        return self._data_len  # byte length of the data file as recorded in the header
+
+    @property
+    def sizes(self):
+        return self._sizes
+
+    @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+    def __getitem__(self, i):
+        return self._pointers[i], self._sizes[i]
+
+    def __len__(self):
+        return self._len
+
+
+def vocab_file_path(prefix_path):  # path of the coder/vocab file stored next to <prefix>.idx and <prefix>.bin
+    return prefix_path + ".vocab"
+
+
+class HuffmanMMapIndexedDataset(torch.utils.data.Dataset):
+    """
+    an indexed dataset that use mmap and memoryview to access data from disk
+    that was compressed with a HuffmanCoder.
+    """
+
+    def __init__(self, prefix_path):
+        super().__init__()
+
+        self._prefix_path = None
+        self._index = None
+        self._bin_buffer = None
+        self._coder = None
+        self._file = None
+        # placeholders above/below are filled in by _do_init
+        self._bin_buffer_mmap = None
+
+        self._do_init(prefix_path)
+
+    def __getstate__(self):
+        return self._prefix_path  # only the prefix is pickled; __setstate__ re-opens everything
+
+    def __setstate__(self, state):
+        self._do_init(state)
+
+    def _do_init(self, prefix_path):
+        self._prefix_path = prefix_path
+        self._index = HuffmanMMapIndex(
+            indexed_dataset.index_file_path(self._prefix_path)
+        )
+        self._coder = HuffmanCoder.from_file(vocab_file_path(self._prefix_path))
+
+        indexed_dataset._warmup_mmap_file(  # read the data file once before mapping it
+            indexed_dataset.data_file_path(self._prefix_path)
+        )
+        self._file = os.open(  # raw OS-level file descriptor, closed in __del__
+            indexed_dataset.data_file_path(self._prefix_path), os.O_RDONLY
+        )
+        self._bin_buffer_mmap = mmap.mmap(
+            self._file,
+            self._index.data_len,  # map exactly the length recorded in the index header
+            access=mmap.ACCESS_READ,
+        )
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+
+    def __del__(self):
+        del self._bin_buffer
+        if self._file:  # NOTE(review): a descriptor value of 0 would be skipped by this truthiness test
+            os.close(self._file)
+        del self._index
+
+    def __len__(self):
+        return len(self._index)
+
+    def _decode(self, i):
+        ptr, _ = self._index[i]  # ptr is used as the END offset of item i in the data file
+        if i == 0:
+            raw_bytes = self._bin_buffer[:ptr]
+        else:
+            (prev_ptr, _) = self._index[i - 1]  # the previous item's end is this item's start
+            raw_bytes = self._bin_buffer[prev_ptr:ptr]
+
+        return self._coder.decode(raw_bytes.tobytes())
+
+    @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+    def __getitem__(self, i):
+        nodes = self._decode(i)
+        return torch.tensor([n.id for n in nodes], dtype=torch.int64)  # ids of the decoded nodes
+
+    def __iter__(self):
+        for idx in range(len(self)):
+            yield self[idx]
+
+    def get_symbols(self, i):
+        nodes = self._decode(i)
+        for n in nodes:
+            yield n.symbol  # the symbol of each decoded node, in order
+
+    @property
+    def sizes(self):
+        return self._index.sizes
+
+    @property
+    def supports_prefetch(self):
+        return False
+
+    @staticmethod
+    def exists(prefix_path):
+        return (  # all three files (.idx, .bin, .vocab) must be present
+            PathManager.exists(indexed_dataset.index_file_path(prefix_path))
+            and PathManager.exists(indexed_dataset.data_file_path(prefix_path))
+            and PathManager.exists(vocab_file_path(prefix_path))
+        )
+
+
+class HuffmanMMapIndexedDatasetBuilder:
+    """
+    Helper to build a memory mapped datasets with a huffman encoder.
+    You can either open/close this manually or use it as a ContextManager.
+    Provide your own coder, it will then be stored alongside the dataset.
+    The builder will first write the vocab file, then open the binary file so you can stream
+    into it, finally the index will be written when the builder is closed (your index should fit in memory).
+    """
+
+    def __init__(self, path_prefix: str, coder: HuffmanCoder) -> None:
+        self._path_prefix = path_prefix
+        self._coder = coder
+        self._sizes = []  # number of tokens of each item
+        self._ptrs = []  # cumulative end offset (in bytes) of each item in the data file
+        self._data_len = 0  # total number of bytes written so far
+
+    def open(self):
+        self._coder.to_file(vocab_file_path(self._path_prefix))  # vocab file is written first
+        self._data_file = open(indexed_dataset.data_file_path(self._path_prefix), "wb")
+
+    def __enter__(self) -> "HuffmanMMapIndexedDatasetBuilder":
+        self.open()
+        return self
+
+    def add_item(self, tokens: tp.List[str]) -> None:
+        """
+        add a list of tokens to the dataset, they will compressed with the
+        provided coder before being written to file.
+        """
+        encoded = self._coder.encode(tokens)
+        code_len = len(encoded)
+        last_ptr = 0  # the first item starts at offset 0
+        if len(self._ptrs) > 0:
+            last_ptr = self._ptrs[-1]
+        self._sizes.append(len(tokens))
+        self._ptrs.append(last_ptr + code_len)  # store where this item ends
+        self._data_len += code_len
+        self._data_file.write(encoded)
+
+    def append(self, other_dataset_path_prefix: str) -> None:
+        """
+        append an existing dataset.
+        Beware, if it wasn't built with the same coder, you are in trouble.
+        """
+        other_index = HuffmanMMapIndex(
+            indexed_dataset.index_file_path(other_dataset_path_prefix)
+        )
+        for (ptr, size) in other_index:
+            self._ptrs.append(ptr + self._data_len)  # shift by the bytes already in this dataset
+            self._sizes.append(size)
+
+        # Concatenate data
+        with open(indexed_dataset.data_file_path(other_dataset_path_prefix), "rb") as f:
+            shutil.copyfileobj(f, self._data_file)
+
+        self._data_len += other_index.data_len  # updated only after the loop above has used the old value
+
+    def close(self):
+        self._data_file.close()
+        with HuffmanMMapIndex.writer(  # the index is written once, when the builder is closed
+            indexed_dataset.index_file_path(self._path_prefix), self._data_len
+        ) as index:
+            index.write(self._sizes, self._ptrs)
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()  # NOTE(review): also runs, and writes the index, when the with-body raised
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
new file mode 100644
index 0000000000..3e4d7969cf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class IdDataset(FairseqDataset):
+    def __getitem__(self, index):
+        return index  # the sample for an index is the index itself
+
+    def __len__(self):
+        return 0  # NOTE(review): always reports length 0 — presumably sized by sibling datasets; confirm
+
+    def collater(self, samples):
+        return torch.tensor(samples)  # batch the collected indices into one tensor
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
new file mode 100644
index 0000000000..1947d99408
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
@@ -0,0 +1,592 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import shutil
+import struct
+from functools import lru_cache
+
+import numpy as np
+import torch
+from fairseq.dataclass.constants import DATASET_IMPL_CHOICES
+from fairseq.data.fasta_dataset import FastaDataset
+from fairseq.file_io import PathManager
+from fairseq.data.huffman import HuffmanMMapIndexedDataset, HuffmanMMapIndex
+
+from . import FairseqDataset
+
+from typing import Union
+
+
+def best_fitting_int_dtype(
+    max_int_to_represent,
+) -> Union[np.uint16, np.uint32, np.int64]:
+    # Returns the numpy integer type (the class, not an instance) chosen for the given maximum value.
+    if max_int_to_represent is None:
+        return np.uint32  # Safe guess
+    elif max_int_to_represent < 65500:  # NOTE(review): threshold is slightly below the uint16 maximum of 65535
+        return np.uint16
+    elif max_int_to_represent < 4294967295:  # 4294967295 == 2**32 - 1
+        return np.uint32
+    else:
+        return np.int64
+        # we avoid np.uint64 because it doesn't save space and its type promotion behaves unexpectedly
+        # https://github.com/numpy/numpy/issues/5745
+
+
+def get_available_dataset_impl():  # the entries of DATASET_IMPL_CHOICES, each converted with str()
+    return list(map(str, DATASET_IMPL_CHOICES))
+
+
+def infer_dataset_impl(path):  # returns "raw", "cached", "mmap", "huffman", "fasta" or None
+    if IndexedRawTextDataset.exists(path):  # i.e. `path` itself exists
+        return "raw"
+    elif IndexedDataset.exists(path):  # <path>.idx and <path>.bin both exist
+        with open(index_file_path(path), "rb") as f:
+            magic = f.read(8)  # the TNTIDX magic is 8 bytes; longer magics are compared by their first 8 bytes
+            if magic == IndexedDataset._HDR_MAGIC:
+                return "cached"
+            elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
+                return "mmap"
+            elif magic == HuffmanMMapIndex._HDR_MAGIC[:8]:
+                return "huffman"
+            else:
+                return None  # index file present but with an unrecognised magic
+    elif FastaDataset.exists(path):
+        return "fasta"
+    else:
+        return None
+
+
+def make_builder(out_file, impl, vocab_size=None):  # returns a builder object writing to `out_file`
+    if impl == "mmap":
+        return MMapIndexedDatasetBuilder(
+            out_file, dtype=best_fitting_int_dtype(vocab_size)  # narrowest dtype for the vocabulary size
+        )
+    elif impl == "fasta":
+        raise NotImplementedError
+    elif impl == "huffman":
+        raise ValueError(
+            "Use HuffmanCodeBuilder directly as it has a different interface."
+        )
+    else:  # every other impl value falls back to the TorchNet-format builder
+        return IndexedDatasetBuilder(out_file)
+
+
+def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None):  # dataset for `impl`, or None
+    if impl == "raw" and IndexedRawTextDataset.exists(path):
+        assert dictionary is not None  # raw text is binarized with the dictionary at load time
+        return IndexedRawTextDataset(path, dictionary)
+    elif impl == "lazy" and IndexedDataset.exists(path):
+        return IndexedDataset(path, fix_lua_indexing=fix_lua_indexing)
+    elif impl == "cached" and IndexedDataset.exists(path):
+        return IndexedCachedDataset(path, fix_lua_indexing=fix_lua_indexing)
+    elif impl == "mmap" and MMapIndexedDataset.exists(path):
+        return MMapIndexedDataset(path)
+    elif impl == "fasta" and FastaDataset.exists(path):
+        from fairseq.data.fasta_dataset import EncodedFastaDataset
+
+        return EncodedFastaDataset(path, dictionary)
+    elif impl == "huffman" and HuffmanMMapIndexedDataset.exists(path):
+        return HuffmanMMapIndexedDataset(path)
+    return None  # unknown impl, or the files required by the requested impl are missing
+
+
+def dataset_exists(path, impl):  # True when the files needed by `impl` exist for `path`
+    if impl == "raw":
+        return IndexedRawTextDataset.exists(path)
+    elif impl == "mmap":
+        return MMapIndexedDataset.exists(path)
+    elif impl == "huffman":
+        return HuffmanMMapIndexedDataset.exists(path)
+    else:  # any other impl is checked against the <path>.idx / <path>.bin pair
+        return IndexedDataset.exists(path)
+
+
+def read_longs(f, n):  # read `n` int64 values from binary stream `f`
+    a = np.empty(n, dtype=np.int64)
+    f.readinto(a)  # fills the array in place from the stream
+    return a
+
+
+def write_longs(f, a):  # write the values of `a` to binary stream `f` as int64
+    f.write(np.array(a, dtype=np.int64))
+
+
+_code_to_dtype = {  # dtype code stored in index-file headers -> numpy type
+    1: np.uint8,
+    2: np.int8,
+    3: np.int16,
+    4: np.int32,
+    5: np.int64,
+    6: np.float64,
+    7: np.double,  # np.double is the same type as np.float64, so codes 6 and 7 decode identically
+    8: np.uint16,
+    9: np.uint32,
+    10: np.uint64,
+}
+
+
+def _dtype_header_code(dtype) -> int:  # reverse lookup in _code_to_dtype
+    for k in _code_to_dtype.keys():
+        if _code_to_dtype[k] == dtype:
+            return k  # first matching code in insertion order
+    raise ValueError(dtype)  # dtype has no header code
+
+
+def index_file_path(prefix_path):  # index file of a dataset prefix: <prefix>.idx
+    return prefix_path + ".idx"
+
+
+def data_file_path(prefix_path):  # data file of a dataset prefix: <prefix>.bin
+    return prefix_path + ".bin"
+
+
+class IndexedDataset(FairseqDataset):
+    """Loader for TorchNet IndexedDataset"""
+
+    _HDR_MAGIC = b"TNTIDX\x00\x00"  # 8-byte magic at the start of the index file
+
+    def __init__(self, path, fix_lua_indexing=False):
+        super().__init__()
+        self.path = path
+        self.fix_lua_indexing = fix_lua_indexing
+        self.data_file = None  # opened lazily by the first __getitem__ call
+        self.read_index(path)
+
+    def read_index(self, path):
+        with open(index_file_path(path), "rb") as f:
+            magic = f.read(8)
+            assert magic == self._HDR_MAGIC, (
+                "Index file doesn't match expected format. "
+                "Make sure that --dataset-impl is configured properly."
+            )
+            version = f.read(8)
+            assert struct.unpack("<Q", version) == (1,)  # only format version 1 is accepted
+            code, self.element_size = struct.unpack("<QQ", f.read(16))
+            self.dtype = _code_to_dtype[code]
+            self._len, self.s = struct.unpack("<QQ", f.read(16))  # item count, total number of stored dimension sizes
+            self.dim_offsets = read_longs(f, self._len + 1)  # boundaries into self.sizes, one more than items
+            self.data_offsets = read_longs(f, self._len + 1)  # boundaries into the data file, in elements
+            self.sizes = read_longs(f, self.s)
+
+    def read_data(self, path):
+        self.data_file = open(data_file_path(path), "rb", buffering=0)  # unbuffered binary handle
+
+    def check_index(self, i):
+        if i < 0 or i >= self._len:  # negative indices are rejected rather than wrapped
+            raise IndexError("index out of range")
+
+    def __del__(self):
+        if self.data_file:
+            self.data_file.close()
+
+    @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+    def __getitem__(self, i) -> torch.Tensor:
+        if not self.data_file:
+            self.read_data(self.path)
+        self.check_index(i)
+        tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]]  # shape of item i
+        a = np.empty(tensor_size, dtype=self.dtype)
+        self.data_file.seek(self.data_offsets[i] * self.element_size)  # element offset -> byte offset
+        self.data_file.readinto(a)
+        item = torch.from_numpy(a).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+    def __len__(self):
+        return self._len
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        return PathManager.exists(index_file_path(path)) and PathManager.exists(
+            data_file_path(path)
+        )
+
+    @property
+    def supports_prefetch(self):
+        return False  # avoid prefetching to save memory
+
+
+class IndexedCachedDataset(IndexedDataset):
+    def __init__(self, path, fix_lua_indexing=False):
+        super().__init__(path, fix_lua_indexing=fix_lua_indexing)
+        self.cache = None  # flat array holding the prefetched items back to back
+        self.cache_index = {}  # item index -> start position inside self.cache
+
+    @property
+    def supports_prefetch(self):
+        return True
+
+    def prefetch(self, indices):
+        if all(i in self.cache_index for i in indices):  # everything requested is already cached
+            return
+        if not self.data_file:
+            self.read_data(self.path)
+        indices = sorted(set(indices))  # de-duplicate and read in ascending index order
+        total_size = 0
+        for i in indices:
+            total_size += self.data_offsets[i + 1] - self.data_offsets[i]
+        self.cache = np.empty(total_size, dtype=self.dtype)
+        ptx = 0  # write position inside the cache
+        self.cache_index.clear()  # the previous cache content is replaced, not extended
+        for i in indices:
+            self.cache_index[i] = ptx
+            size = self.data_offsets[i + 1] - self.data_offsets[i]
+            a = self.cache[ptx : ptx + size]  # slice of the cache that readinto fills below
+            self.data_file.seek(self.data_offsets[i] * self.element_size)
+            self.data_file.readinto(a)
+            ptx += size
+        if self.data_file:
+            # close and delete data file after prefetch so we can pickle
+            self.data_file.close()
+            self.data_file = None
+
+    @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+    def __getitem__(self, i):
+        self.check_index(i)
+        tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]]
+        a = np.empty(tensor_size, dtype=self.dtype)
+        ptx = self.cache_index[i]  # KeyError if item i was not part of the last prefetch
+        np.copyto(a, self.cache[ptx : ptx + a.size])
+        item = torch.from_numpy(a).long()
+        if self.fix_lua_indexing:
+            item -= 1  # subtract 1 for 0-based indexing
+        return item
+
+
+class IndexedRawTextDataset(FairseqDataset):
+    """Takes a text file as input and binarizes it in memory at instantiation.
+    Original lines are also kept in memory"""
+
+    def __init__(self, path, dictionary, append_eos=True, reverse_order=False):
+        self.tokens_list = []
+        self.lines = []
+        self.sizes = []
+        self.append_eos = append_eos
+        self.reverse_order = reverse_order
+        self.read_data(path, dictionary)
+        self.size = len(self.tokens_list)
+
+    def read_data(self, path, dictionary):
+        with open(path, "r", encoding="utf-8") as f:
+            for line in f:
+                self.lines.append(line.strip("\n"))
+                tokens = dictionary.encode_line(
+                    line,
+                    add_if_not_exist=False,
+                    append_eos=self.append_eos,
+                    reverse_order=self.reverse_order,
+                ).long()
+                self.tokens_list.append(tokens)
+                self.sizes.append(len(tokens))
+        self.sizes = np.array(self.sizes)
+
+    def check_index(self, i):
+        if i < 0 or i >= self.size:
+            raise IndexError("index out of range")
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, i):
+        self.check_index(i)
+        return self.tokens_list[i]
+
+    def get_original_text(self, i):
+        self.check_index(i)
+        return self.lines[i]
+
+    def __del__(self):
+        pass
+
+    def __len__(self):
+        return self.size
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    @staticmethod
+    def exists(path):
+        return PathManager.exists(path)
+
+
+class IndexedDatasetBuilder:
+    element_sizes = {
+        np.uint8: 1,
+        np.int8: 1,
+        np.int16: 2,
+        np.int32: 4,
+        np.int64: 8,
+        np.float64: 4,
+        np.double: 8,
+    }
+
+    def __init__(self, out_file, dtype=np.int32):
+        self.out_file = open(out_file, "wb")
+        self.dtype = dtype
+        self.data_offsets = [0]
+        self.dim_offsets = [0]
+        self.sizes = []
+        self.element_size = self.element_sizes[self.dtype]
+
+    def add_item(self, tensor):
+        # +1 for Lua compatibility
+        bytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype))
+        self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size)
+        for s in tensor.size():
+            self.sizes.append(s)
+        self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
+
+    def merge_file_(self, another_file):
+        index = IndexedDataset(another_file)
+        assert index.dtype == self.dtype
+
+        begin = self.data_offsets[-1]
+        for offset in index.data_offsets[1:]:
+            self.data_offsets.append(begin + offset)
+        self.sizes.extend(index.sizes)
+        begin = self.dim_offsets[-1]
+        for dim_offset in index.dim_offsets[1:]:
+            self.dim_offsets.append(begin + dim_offset)
+
+        with open(data_file_path(another_file), "rb") as f:
+            while True:
+                data = f.read(1024)
+                if data:
+                    self.out_file.write(data)
+                else:
+                    break
+
+    def finalize(self, index_file):
+        self.out_file.close()
+        index = open(index_file, "wb")
+        index.write(b"TNTIDX\x00\x00")
+        index.write(struct.pack("<Q", 1))
+        index.write(
+            struct.pack("<QQ", _dtype_header_code(self.dtype), self.element_size)
+        )
+        index.write(struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes)))
+        write_longs(index, self.dim_offsets)
+        write_longs(index, self.data_offsets)
+        write_longs(index, self.sizes)
+        index.close()
+
+
+def _warmup_mmap_file(path):  # read the whole file once and discard the bytes
+    with open(path, "rb") as stream:
+        while stream.read(100 * 1024 * 1024):  # 100 MiB per read, until EOF returns b""
+            pass
+
+
+class MMapIndexedDataset(torch.utils.data.Dataset):
+    class Index:
+        _HDR_MAGIC = b"MMIDIDX\x00\x00"  # 9-byte magic at the start of the index file
+
+        @classmethod
+        def writer(cls, path, dtype):
+            class _Writer:  # context manager that creates and fills the index file at `path`
+                def __enter__(self):
+                    self._file = open(path, "wb")
+
+                    self._file.write(cls._HDR_MAGIC)
+                    self._file.write(struct.pack("<Q", 1))  # format version
+                    self._file.write(struct.pack("<B", _dtype_header_code(dtype)))  # one-byte dtype code
+
+                    return self
+
+                @staticmethod
+                def _get_pointers(sizes):
+                    dtype_size = dtype().itemsize  # bytes per element of the data dtype
+                    address = 0
+                    pointers = []
+
+                    for size in sizes:
+                        pointers.append(address)  # byte offset where this item STARTS
+                        address += size * dtype_size
+
+                    return pointers
+
+                def write(self, sizes):
+                    pointers = self._get_pointers(sizes)
+
+                    self._file.write(struct.pack("<Q", len(sizes)))  # number of items
+
+                    sizes = np.array(sizes, dtype=np.int32)
+                    self._file.write(sizes.tobytes(order="C"))
+                    del sizes
+
+                    pointers = np.array(pointers, dtype=np.int64)
+                    self._file.write(pointers.tobytes(order="C"))
+                    del pointers
+
+                def __exit__(self, exc_type, exc_val, exc_tb):
+                    self._file.close()
+
+            return _Writer()
+
+        def __init__(self, path):
+            with open(path, "rb") as stream:
+                magic_test = stream.read(9)  # _HDR_MAGIC is 9 bytes long
+                assert self._HDR_MAGIC == magic_test, (
+                    "Index file doesn't match expected format. "
+                    "Make sure that --dataset-impl is configured properly."
+                )
+                version = struct.unpack("<Q", stream.read(8))
+                assert (1,) == version  # only format version 1 is accepted
+
+                (dtype_code,) = struct.unpack("<B", stream.read(1))
+                self._dtype = _code_to_dtype[dtype_code]
+                self._dtype_size = self._dtype().itemsize
+
+                self._len = struct.unpack("<Q", stream.read(8))[0]
+                offset = stream.tell()  # byte position where the sizes array begins
+
+            _warmup_mmap_file(path)  # read the file once before mapping it
+
+            self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
+            self._bin_buffer = memoryview(self._bin_buffer_mmap)
+            self._sizes = np.frombuffer(  # int32 array backed by the mapped file
+                self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
+            )
+            self._pointers = np.frombuffer(  # int64 array backed by the mapped file
+                self._bin_buffer,
+                dtype=np.int64,
+                count=self._len,
+                offset=offset + self._sizes.nbytes,  # pointers are stored right after the sizes
+            )
+
+        def __del__(self):
+            self._bin_buffer_mmap._mmap.close()  # NOTE(review): raises if __init__ failed before the memmap was created
+            del self._bin_buffer_mmap
+
+        @property
+        def dtype(self):
+            return self._dtype
+
+        @property
+        def sizes(self):
+            return self._sizes
+
+        @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+        def __getitem__(self, i):
+            return self._pointers[i], self._sizes[i]
+
+        def __len__(self):
+            return self._len
+
+    def __init__(self, path):
+        super().__init__()
+
+        self._path = None
+        self._index = None
+        self._bin_buffer = None
+        # the placeholders above are filled in by _do_init
+        self._do_init(path)
+
+    def __getstate__(self):
+        return self._path  # only the path prefix is pickled; __setstate__ re-opens the files
+
+    def __setstate__(self, state):
+        self._do_init(state)
+
+    def _do_init(self, path):
+        self._path = path
+        self._index = self.Index(index_file_path(self._path))
+
+        _warmup_mmap_file(data_file_path(self._path))
+        self._bin_buffer_mmap = np.memmap(
+            data_file_path(self._path), mode="r", order="C"
+        )
+        self._bin_buffer = memoryview(self._bin_buffer_mmap)
+
+    def __del__(self):
+        self._bin_buffer_mmap._mmap.close()  # NOTE(review): raises if _do_init failed before the memmap was created
+        del self._bin_buffer_mmap
+        del self._index
+
+    def __len__(self):
+        return len(self._index)
+
+    @lru_cache(maxsize=8)  # NOTE(review): the cache key includes self, so cached entries reference the instance
+    def __getitem__(self, i):
+        ptr, size = self._index[i]  # byte offset and element count of item i
+        np_array = np.frombuffer(
+            self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr
+        )
+        if self._index.dtype != np.int64:
+            np_array = np_array.astype(np.int64)  # items are always returned as int64
+
+        return torch.from_numpy(np_array)
+
+    @property
+    def sizes(self):
+        return self._index.sizes
+
+    @property
+    def supports_prefetch(self):
+        return False
+
+    @staticmethod
+    def exists(path):
+        return PathManager.exists(index_file_path(path)) and PathManager.exists(
+            data_file_path(path)
+        )
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        # TODO: a quick fix. make it a child class of FairseqDataset instead?
+        return True
+
+
+def get_indexed_dataset_to_local(path) -> str:  # returns the local prefix shared by the .idx and .bin files
+    local_index_path = PathManager.get_local_path(index_file_path(path))
+    local_data_path = PathManager.get_local_path(data_file_path(path))
+
+    assert local_index_path.endswith(".idx") and local_data_path.endswith(".bin"), (
+        "PathManager.get_local_path does not return files with expected patterns: "
+        f"{local_index_path} and {local_data_path}"
+    )
+
+    local_path = local_data_path[:-4]  # stripping suffix ".bin"
+    assert local_path == local_index_path[:-4]  # stripping suffix ".idx"; both files must share one prefix
+    return local_path
+
+
+class MMapIndexedDatasetBuilder:
+    def __init__(self, out_file, dtype=np.int64):
+        self._data_file = open(out_file, "wb")  # stays open until finalize()
+        self._dtype = dtype
+        self._sizes = []  # element count of each item, in insertion order
+
+    def add_item(self, tensor):
+        np_array = np.array(tensor.numpy(), dtype=self._dtype)  # convert to the on-disk dtype
+        self._data_file.write(np_array.tobytes(order="C"))
+        self._sizes.append(np_array.size)
+
+    def merge_file_(self, another_file):
+        # Concatenate index
+        index = MMapIndexedDataset.Index(index_file_path(another_file))
+        assert index.dtype == self._dtype  # both datasets must use the same element dtype
+
+        for size in index.sizes:
+            self._sizes.append(size)
+
+        # Concatenate data
+        with open(data_file_path(another_file), "rb") as f:
+            shutil.copyfileobj(f, self._data_file)
+
+    def finalize(self, index_file):
+        self._data_file.close()
+
+        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
+            index.write(self._sizes)  # pointers are derived from the sizes by the writer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
new file mode 100644
index 0000000000..a488265137
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
@@ -0,0 +1,879 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import math
+import operator
+import os
+import queue
+import time
+from threading import Thread
+from typing import Iterator, List
+
+import numpy as np
+import torch
+from fairseq.data import data_utils
+
+
+logger = logging.getLogger(__name__)
+
+# Object used by BackgroundConsumer to signal the source is exhausted
+# to the main thread.
+_sentinel = object()
+
+
+class CountingIterator(object):
+    """Wrapper around an iterable that maintains the iteration count.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        start (int): starting iteration count. Note that this doesn't
+            actually advance the iterator.
+        total (int): override the iterator length returned by ``__len__``.
+            This can be used to truncate *iterator*.
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, start=None, total=None):
+        self._itr = iter(iterable)
+        self.n = start or getattr(iterable, "n", 0)
+        self.total = total if total is not None else self.n + len(iterable)
+
+    def __len__(self):
+        return self.total
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if not self.has_next():
+            raise StopIteration
+        try:
+            x = next(self._itr)
+        except StopIteration:
+            raise IndexError(
+                f"Iterator expected to have length {self.total}, "
+                f"but exhausted at position {self.n}."
+            )
+        self.n += 1
+        return x
+
+    def has_next(self):
+        """Whether the iterator still has elements to yield (``n < total``)."""
+        return self.n < self.total
+
+    def skip(self, n):
+        """Fast-forward the iterator by skipping n elements."""
+        for _ in range(n):
+            next(self)
+        return self
+
+    def take(self, n):
+        """Truncate the iterator to n elements at most."""
+        self.total = min(self.total, n)
+        # Propagate this change to the underlying iterator
+        if hasattr(self._itr, "take"):
+            self._itr.take(max(n - self.n, 0))
+        return self
+
+
+class EpochBatchIterating(object):
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    @property
+    def next_epoch_idx(self):
+        raise NotImplementedError
+
+    def next_epoch_itr(
+        self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True
+    ):
+        """Return a new iterator over the dataset.
+
+        Args:
+            shuffle (bool, optional): shuffle batches before returning the
+                iterator (default: True).
+            fix_batches_to_gpus (bool, optional): ensure that batches are always
+                allocated to the same shards across epochs. Requires
+                that :attr:`dataset` supports prefetching (default: False).
+            set_dataset_epoch (bool, optional): update the wrapped Dataset with
+                the new epoch number (default: True).
+        """
+        raise NotImplementedError
+
+    def end_of_epoch(self) -> bool:
+        """Returns whether the most recent epoch iterator has been exhausted"""
+        raise NotImplementedError
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        """The number of consumed batches in the current epoch."""
+        raise NotImplementedError
+
+    def state_dict(self):
+        """Returns a dictionary containing a whole state of the iterator."""
+        raise NotImplementedError
+
+    def load_state_dict(self, state_dict):
+        """Copies the state of the iterator from the given *state_dict*."""
+        raise NotImplementedError
+
+    @property
+    def first_batch(self):
+        return "DUMMY"
+
+
+class StreamingEpochBatchIterator(EpochBatchIterating):
+    """A streaming-style iterator over a :class:`torch.utils.data.IterableDataset`.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset from which to load the data
+        max_sentences: batch size
+        collate_fn (callable): merges a list of samples to form a mini-batch
+        num_workers (int, optional): how many subprocesses to use for data
+            loading. 0 means the data will be loaded in the main process
+            (default: 0).
+        epoch (int, optional): the epoch to start the iterator from
+            (default: 1).
+        buffer_size (int, optional): the number of batches to keep ready in the
+            queue. Helps speeding up dataloading. When buffer_size is zero, the
+            default torch.utils.data.DataLoader preloading is used.
+        timeout (int, optional): if positive, the timeout value for collecting a batch
+            from workers. Should always be non-negative (default: ``0``).
+    """
+
+    def __init__(
+        self,
+        dataset,
+        max_sentences=1,
+        collate_fn=None,
+        epoch=1,
+        num_workers=0,
+        buffer_size=0,
+        timeout=0,
+        persistent_workers=True,
+    ):
+        assert isinstance(dataset, torch.utils.data.IterableDataset)
+        self.dataset = dataset
+        self.max_sentences = max_sentences
+        self.collate_fn = collate_fn
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
+        self.num_workers = num_workers
+        self.persistent_workers = persistent_workers and num_workers > 0
+        # This upper limit here is to prevent people from abusing this feature
+        # in a shared computing environment.
+        self.buffer_size = min(buffer_size, 20)
+        self.timeout = timeout
+
+        self._current_epoch_iterator = None
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        if self._current_epoch_iterator is not None and self.end_of_epoch():
+            return self.epoch + 1
+        else:
+            return self.epoch
+
+    def next_epoch_itr(
+        self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True
+    ):
+        self.epoch = self.next_epoch_idx
+        if set_dataset_epoch and hasattr(self.dataset, "set_epoch"):
+            self.dataset.set_epoch(self.epoch)
+        self._current_epoch_iterator = self._get_iterator_for_epoch(self.epoch, shuffle)
+        return self._current_epoch_iterator
+
+    def end_of_epoch(self) -> bool:
+        return not self._current_epoch_iterator.has_next()
+
+    @property
+    def iterations_in_epoch(self) -> int:
+        if self._current_epoch_iterator is not None:
+            return self._current_epoch_iterator.n
+        return 0
+
+    def state_dict(self):
+        return {
+            "epoch": self.epoch,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.epoch = state_dict["epoch"]
+
+    def _get_iterator_for_epoch(self, epoch, shuffle, offset=0):
+        if self.num_workers > 0:
+            os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning"
+
+        # Create data loader
+        worker_init_fn = getattr(self.dataset, "worker_init_fn", None)
+        itr = torch.utils.data.DataLoader(
+            self.dataset,
+            batch_size=self.max_sentences,
+            collate_fn=self.collate_fn,
+            num_workers=self.num_workers,
+            timeout=self.timeout,
+            worker_init_fn=worker_init_fn,
+            pin_memory=True,
+            persistent_workers=self.persistent_workers,
+        )
+
+        # Wrap with a BufferedIterator if needed
+        if self.buffer_size > 0:
+            itr = BufferedIterator(self.buffer_size, itr)
+
+        # Wrap with CountingIterator
+        itr = CountingIterator(itr, start=offset)
+
+        return itr
+
+
+class FrozenBatchSampler:
+    def __init__(
+        self,
+        ordered_batches,
+        epoch,
+        fix_batches_to_gpus,
+        shuffle,
+        initial_offset,
+    ):
+        self.ordered_batches = ordered_batches
+        self.fix_batches_to_gpus = fix_batches_to_gpus
+        self.shuffle = shuffle
+        self.make_batches_for_epoch(epoch, initial_offset)
+
+    def make_batches_for_epoch(self, epoch, offset=0):
+        self.batches = self.ordered_batches(
+            epoch, self.fix_batches_to_gpus, self.shuffle
+        )
+        if offset > 0:
+            self.batches = self.batches[offset:]
+
+    def __iter__(self) -> Iterator[List[int]]:
+        return iter(self.batches)
+
+    def __len__(self) -> int:
+        return len(self.batches)
+
+
+class EpochBatchIterator(EpochBatchIterating):
+    """A multi-epoch iterator over a :class:`torch.utils.data.Dataset`.
+
+    Compared to :class:`torch.utils.data.DataLoader`, this iterator:
+
+    - can be reused across multiple epochs with the :func:`next_epoch_itr`
+      method (optionally shuffled between epochs)
+    - can be serialized/deserialized with the :func:`state_dict` and
+      :func:`load_state_dict` methods
+    - supports sharding with the *num_shards* and *shard_id* arguments
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset from which to load the data
+        collate_fn (callable): merges a list of samples to form a mini-batch
+        batch_sampler (~torch.utils.data.Sampler or a callable): an iterator over batches of
+            indices, or a callable to create such an iterator (~torch.utils.data.Sampler).
+            A callable batch_sampler will be called for each epoch to enable per epoch dynamic
+            batch iterators defined by this callable batch_sampler.
+        seed (int, optional): seed for random number generator for
+            reproducibility (default: 1).
+        num_shards (int, optional): shard the data iterator into N
+            shards (default: 1).
+        shard_id (int, optional): which shard of the data iterator to
+            return (default: 0).
+        num_workers (int, optional): how many subprocesses to use for data
+            loading. 0 means the data will be loaded in the main process
+            (default: 0).
+        epoch (int, optional): the epoch to start the iterator from
+            (default: 1).
+        buffer_size (int, optional): the number of batches to keep ready in the
+            queue. Helps speeding up dataloading. When buffer_size is zero, the
+            default torch.utils.data.DataLoader preloading is used.
+        timeout (int, optional): if positive, the timeout value for collecting a batch
+            from workers. Should always be non-negative (default: ``0``).
+        disable_shuffling (bool, optional): force disable shuffling
+            (default: ``False``).
+        skip_remainder_batch (bool, optional): if set, discard the last batch in an epoch
+            for the sake of training stability, as the last batch is usually smaller than
+                local_batch_size * distributed_world_size (default: ``False``).
+        grouped_shuffling (bool, optional): enable shuffling batches in groups
+            of num_shards. Ensures that each GPU receives similar length sequences when
+            batches are sorted by length.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        collate_fn,
+        batch_sampler,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=1,
+        buffer_size=0,
+        timeout=0,
+        disable_shuffling=False,
+        skip_remainder_batch=False,
+        grouped_shuffling=False,
+        reuse_dataloader=False,
+        persistent_workers=True,
+    ):
+        assert isinstance(dataset, torch.utils.data.Dataset)
+        self.dataset = dataset
+        self.collate_fn = collate_fn
+        self.batch_sampler = batch_sampler
+        self._frozen_batches = (
+            tuple(batch_sampler) if not callable(batch_sampler) else None
+        )
+        self.seed = seed
+        self.num_shards = num_shards
+        self.shard_id = shard_id
+        self.num_workers = num_workers
+        self.persistent_workers = persistent_workers and num_workers > 0
+        # This upper limit here is to prevent people from abusing this feature
+        # in a shared computing environment.
+        self.buffer_size = min(buffer_size, 20)
+        self.timeout = timeout
+        self.disable_shuffling = disable_shuffling
+        self.skip_remainder_batch = skip_remainder_batch
+        self.grouped_shuffling = grouped_shuffling
+
+        self.epoch = max(epoch, 1)  # we use 1-based indexing for epochs
+        self.shuffle = not disable_shuffling
+        self._cur_epoch_itr = None
+        self._next_epoch_itr = None
+        self._supports_prefetch = getattr(dataset, "supports_prefetch", False)
+
+        self.dataloader = None
+        self.reuse_dataloader = reuse_dataloader
+
+    @property
+    def frozen_batches(self):
+        if self._frozen_batches is None:
+            self._frozen_batches = tuple(self.batch_sampler(self.dataset, self.epoch))
+        return self._frozen_batches
+
+    @property
+    def first_batch(self):
+        if len(self.frozen_batches) == 0:
+            raise Exception(
+                "The dataset is empty. This could indicate "
+                "that all elements in the dataset have been skipped. "
+                "Try increasing the max number of allowed tokens or using "
+                "a larger dataset."
+            )
+
+        if getattr(self.dataset, "supports_fetch_outside_dataloader", True):
+            return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0]])
+        else:
+            return "DUMMY"
+
+    def __len__(self):
+        return int(math.ceil(len(self.frozen_batches) / float(self.num_shards)))
+
+    @property
+    def n(self):
+        return self.iterations_in_epoch
+
+    @property
+    def next_epoch_idx(self):
+        """Return the epoch index after *next_epoch_itr* is called."""
+        if self._next_epoch_itr is not None:
+            return self.epoch
+        elif self._cur_epoch_itr is not None and self.end_of_epoch():
+            return self.epoch + 1
+        else:
+            return self.epoch
+
+    def next_epoch_itr(
+        self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True
+    ):
+        """Return a new iterator over the dataset.
+
+        Args:
+            shuffle (bool, optional): shuffle batches before returning the
+                iterator (default: True).
+            fix_batches_to_gpus (bool, optional): ensure that batches are always
+                allocated to the same shards across epochs. Requires
+                that :attr:`dataset` supports prefetching (default: False).
+            set_dataset_epoch (bool, optional): update the wrapped Dataset with
+                the new epoch number (default: True).
+        """
+        if self.disable_shuffling:
+            shuffle = False
+        prev_epoch = self.epoch
+        self.epoch = self.next_epoch_idx
+        if set_dataset_epoch and hasattr(self.dataset, "set_epoch"):
+            self.dataset.set_epoch(self.epoch)
+        if self._next_epoch_itr is not None:
+            self._cur_epoch_itr = self._next_epoch_itr
+            self._next_epoch_itr = None
+        else:
+            if callable(self.batch_sampler) and prev_epoch != self.epoch:
+                # reset _frozen_batches to refresh the next epoch
+                self._frozen_batches = None
+            self._cur_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch,
+                shuffle,
+                fix_batches_to_gpus=fix_batches_to_gpus,
+            )
+        self.shuffle = shuffle
+        return self._cur_epoch_itr
+
+    def end_of_epoch(self) -> bool:
+        """Returns whether the most recent epoch iterator has been exhausted"""
+        return not self._cur_epoch_itr.has_next()
+
+    @property
+    def iterations_in_epoch(self):
+        """The number of consumed batches in the current epoch."""
+        if self._cur_epoch_itr is not None:
+            return self._cur_epoch_itr.n
+        elif self._next_epoch_itr is not None:
+            return self._next_epoch_itr.n
+        return 0
+
+    def state_dict(self):
+        """Returns a dictionary containing a whole state of the iterator."""
+        if self.end_of_epoch():
+            epoch = self.epoch + 1
+            iter_in_epoch = 0
+        else:
+            epoch = self.epoch
+            iter_in_epoch = self.iterations_in_epoch
+        return {
+            "version": 2,
+            "epoch": epoch,
+            "iterations_in_epoch": iter_in_epoch,
+            "shuffle": self.shuffle,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Copies the state of the iterator from the given *state_dict*."""
+        self.epoch = state_dict["epoch"]
+        itr_pos = state_dict.get("iterations_in_epoch", 0)
+        version = state_dict.get("version", 1)
+        if itr_pos > 0:
+            # fast-forward epoch iterator
+            self._next_epoch_itr = self._get_iterator_for_epoch(
+                self.epoch,
+                shuffle=state_dict.get("shuffle", True),
+                offset=itr_pos,
+            )
+            if self._next_epoch_itr is None:
+                if version == 1:
+                    # legacy behavior: we finished the epoch, increment epoch counter
+                    self.epoch += 1
+                else:
+                    raise RuntimeError(
+                        "Cannot resume training due to dataloader mismatch, please "
+                        "report this to the fairseq developers. You can relaunch "
+                        "training with `--reset-dataloader` and it should work."
+                    )
+        else:
+            self._next_epoch_itr = None
+
+    def _get_iterator_for_epoch(
+        self, epoch, shuffle, fix_batches_to_gpus=False, offset=0
+    ):
+        """Return the batch iterator for *epoch*, resuming *offset* batches in."""
+        if self.reuse_dataloader and self.dataloader is not None:
+            self.epoch_batch_sampler.make_batches_for_epoch(epoch, offset)
+            itr = self.dataloader
+        else:
+            self.epoch_batch_sampler = FrozenBatchSampler(
+                self.ordered_batches,
+                epoch,
+                fix_batches_to_gpus,
+                shuffle,
+                initial_offset=offset,
+            )
+
+            if offset > 0 and len(self.epoch_batch_sampler) == 0:
+                return None
+
+            if self.num_workers > 0:
+                os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning"
+
+            # Create data loader
+            itr = torch.utils.data.DataLoader(
+                self.dataset,
+                collate_fn=self.collate_fn,
+                batch_sampler=self.epoch_batch_sampler,
+                num_workers=self.num_workers,
+                timeout=self.timeout,
+                pin_memory=True,
+                persistent_workers=self.persistent_workers,
+            )
+            if self.reuse_dataloader:
+                self.dataloader = itr
+
+        # Wrap with a BufferedIterator if needed
+        if self.buffer_size > 0:
+            itr = BufferedIterator(self.buffer_size, itr)
+
+        # Wrap with CountingIterator
+        itr = CountingIterator(itr, start=offset)
+
+        if self.skip_remainder_batch:
+            # TODO: lazy implementation that drops the final batch even when it is full.
+            # The sampler was already sliced by ``offset`` while the CountingIterator
+            # counts from ``offset``, so the target length must include ``offset``.
+            total_num_itrs = offset + len(self.epoch_batch_sampler) - 1
+            itr.take(total_num_itrs)
+            logger.info(f"skip final residual batch, total_num_itrs = {total_num_itrs}")
+
+        return itr
+
+    def ordered_batches(self, epoch, fix_batches_to_gpus, shuffle):
+        def shuffle_batches(batches, seed):
+            with data_utils.numpy_seed(seed):
+
+                if self.grouped_shuffling:
+                    grouped_batches = [
+                        batches[(i * self.num_shards) : ((i + 1) * self.num_shards)]
+                        for i in range((len(batches) // self.num_shards))
+                    ]
+                    np.random.shuffle(grouped_batches)
+                    batches = list(itertools.chain(*grouped_batches))
+                else:
+                    np.random.shuffle(batches)
+
+            return batches
+
+        if self._supports_prefetch:
+            batches = self.frozen_batches
+
+            if shuffle and not fix_batches_to_gpus:
+                batches = shuffle_batches(list(batches), self.seed + epoch)
+
+            batches = list(
+                ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
+            )
+            self.dataset.prefetch([i for s in batches for i in s])
+
+            if shuffle and fix_batches_to_gpus:
+                batches = shuffle_batches(batches, self.seed + epoch + self.shard_id)
+        else:
+            if shuffle:
+                batches = shuffle_batches(list(self.frozen_batches), self.seed + epoch)
+            else:
+                batches = self.frozen_batches
+            batches = list(
+                ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
+            )
+        return batches
+
+
+class GroupedIterator(CountingIterator):
+    """Wrapper around an iterable that returns groups (chunks) of items.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        chunk_size (int): size of each chunk
+        skip_remainder_batch (bool, optional): if set, discard the last grouped batch in
+          each training epoch, as the last grouped batch is usually smaller than
+                local_batch_size * distributed_world_size * chunk_size (default: ``False``).
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(self, iterable, chunk_size, skip_remainder_batch=False):
+        if skip_remainder_batch:
+            total_num_itrs = int(math.floor(len(iterable) / float(chunk_size)))
+            logger.info(
+                f"skip final residual batch, grouped total_num_itrs = {total_num_itrs}"
+            )
+        else:
+            total_num_itrs = int(math.ceil(len(iterable) / float(chunk_size)))
+            logger.info(f"grouped total_num_itrs = {total_num_itrs}")
+
+        itr = _chunk_iterator(iterable, chunk_size, skip_remainder_batch)
+        super().__init__(
+            itr,
+            start=int(math.ceil(getattr(iterable, "n", 0) / float(chunk_size))),
+            total=total_num_itrs,
+        )
+        self.chunk_size = chunk_size
+
+        if skip_remainder_batch:
+            self.take(total_num_itrs)
+            # TODO: [Hack] Here the grouped iterator modifies the base iterator size so that
+            # training can move into the next epoch once the grouped iterator is exhausted.
+            # Double-check this implementation in case unexpected behavior occurs.
+            iterable.take(total_num_itrs * chunk_size)
+
+
+def _chunk_iterator(itr, chunk_size, skip_remainder_batch=False):
+    chunk = []
+    for x in itr:
+        chunk.append(x)
+        if len(chunk) == chunk_size:
+            yield chunk
+            chunk = []
+    if not skip_remainder_batch and len(chunk) > 0:
+        yield chunk
+
+
+class ShardedIterator(CountingIterator):
+    """A sharded wrapper around an iterable, padded to length.
+
+    Args:
+        iterable (iterable): iterable to wrap
+        num_shards (int): number of shards to split the iterable into
+        shard_id (int): which shard to iterate over
+        fill_value (Any, optional): padding value when the iterable doesn't
+            evenly divide *num_shards* (default: None).
+
+    Attributes:
+        n (int): number of elements consumed from this iterator
+    """
+
+    def __init__(
+        self, iterable, num_shards, shard_id, fill_value=None, skip_remainder_batch=None
+    ):
+        """
+        Args:
+            skip_remainder_batch: ignored"""
+        if shard_id < 0 or shard_id >= num_shards:
+            raise ValueError("shard_id must be between 0 and num_shards")
+        sharded_len = int(math.ceil(len(iterable) / float(num_shards)))
+        itr = map(
+            operator.itemgetter(1),
+            itertools.zip_longest(
+                range(sharded_len),
+                itertools.islice(iterable, shard_id, len(iterable), num_shards),
+                fillvalue=fill_value,
+            ),
+        )
+        super().__init__(
+            itr,
+            start=int(math.ceil(getattr(iterable, "n", 0) / float(num_shards))),
+            total=sharded_len,
+        )
+
+
+class BackgroundConsumer(Thread):
+    def __init__(self, queue, source, max_len, cuda_device):
+        Thread.__init__(self)
+
+        self._queue = queue
+        self._source = source
+        self._max_len = max_len
+        self.count = 0
+        self.cuda_device = cuda_device
+
+    def run(self):
+        # set_device to avoid creation of GPU0 context when using pin_memory
+        if self.cuda_device is not None:
+            torch.cuda.set_device(self.cuda_device)
+
+        try:
+            for item in self._source:
+                self._queue.put(item)
+
+                # Stop if we reached the maximum length
+                self.count += 1
+                if self._max_len is not None and self.count >= self._max_len:
+                    break
+
+            # Signal the consumer we are done.
+            self._queue.put(_sentinel)
+        except Exception as e:
+            self._queue.put(e)
+
+
+class BufferedIterator(object):
+    def __init__(self, size, iterable):
+        self._queue = queue.Queue(size)
+        self._iterable = iterable
+        self._consumer = None
+
+        self.start_time = time.time()
+        self.warning_time = None
+
+        self.total = len(iterable)
+
+    def _create_consumer(self):
+        self._consumer = BackgroundConsumer(
+            self._queue,
+            self._iterable,
+            self.total,
+            torch.cuda.current_device() if torch.cuda.is_available() else None,
+        )
+        self._consumer.daemon = True
+        self._consumer.start()
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return self.total
+
+    def take(self, n):
+        self.total = min(self.total, n)
+        # Propagate this change to the underlying iterator
+        if hasattr(self._iterable, "take"):
+            self._iterable.take(n)
+        return self
+
+    def __next__(self):
+        # Create consumer if not created yet
+        if self._consumer is None:
+            self._create_consumer()
+
+        # Notify the user if there is a data loading bottleneck
+        if self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2)):
+            if time.time() - self.start_time > 5 * 60:
+                if (
+                    self.warning_time is None
+                    or time.time() - self.warning_time > 15 * 60
+                ):
+                    logger.debug(
+                        "Data loading buffer is empty or nearly empty. This may "
+                        "indicate a data loading bottleneck, and increasing the "
+                        "number of workers (--num-workers) may help."
+                    )
+                    self.warning_time = time.time()
+
+        # Get next example
+        item = self._queue.get(True)
+        if isinstance(item, Exception):
+            raise item
+        if item is _sentinel:
+            raise StopIteration()
+        return item
+
+
+class GroupedEpochBatchIterator(EpochBatchIterator):
+    """Grouped version of EpochBatchIterator
+    It takes several samplers from different datasets.
+    Each epoch, every dataset-wise sampler is shuffled individually with a
+    different random seed. Those sub-samplers are then combined into
+    one big sampler with a deterministic permutation to mix batches from
+    different datasets. It will act like EpochBatchIterator but make sure
+    1) data from one data set each time
+    2) for different workers, they use the same order to fetch the data
+    so they will use data from the same dataset everytime
+    mult_rate is used for update_freq > 1 case where we want to make sure update_freq
+    mini-batches come from same source
+    """
+
+    def __init__(
+        self,
+        dataset,
+        collate_fn,
+        batch_samplers,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=0,
+        mult_rate=1,
+        buffer_size=0,
+        skip_remainder_batch=False,
+    ):
+        super().__init__(
+            dataset,
+            collate_fn,
+            batch_samplers,
+            seed,
+            num_shards,
+            shard_id,
+            num_workers,
+            epoch,
+            buffer_size,
+            skip_remainder_batch=skip_remainder_batch,
+        )
+        # level 0: sub-samplers 1: batch_idx 2: batches
+        self._frozen_batches = tuple([tuple(sub_batch) for sub_batch in batch_samplers])
+        self.step_size = mult_rate * num_shards
+
+        self.lengths = [
+            (len(x) // self.step_size) * self.step_size for x in self.frozen_batches
+        ]
+
+    def __len__(self):
+        return sum(self.lengths)
+
+    @property
+    def first_batch(self):
+        """Collated first batch of the first sub-sampler, or ``"DUMMY"``."""
+        if len(self.frozen_batches) == 0:
+            raise Exception(
+                "The dataset is empty. This could indicate "
+                "that all elements in the dataset have been skipped. "
+                "Try increasing the max number of allowed tokens or using "
+                "a larger dataset."
+            )
+        # Mirror EpochBatchIterator: datasets without the flag are assumed fetchable.
+        if getattr(self.dataset, "supports_fetch_outside_dataloader", True):
+            return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0][0]])
+        return "DUMMY"
+
+    def _get_iterator_for_epoch(
+        self, epoch, shuffle, fix_batches_to_gpus=False, offset=0
+    ):
+        def shuffle_batches(batches, seed):
+            with data_utils.numpy_seed(seed):
+                np.random.shuffle(batches)
+            return batches
+
+        def return_full_batches(batch_sets, seed, shuffle):
+            if shuffle:
+                batch_sets = [shuffle_batches(list(x), seed) for x in batch_sets]
+
+            batch_sets = [
+                batch_sets[i][: self.lengths[i]] for i in range(len(batch_sets))
+            ]
+            batches = list(itertools.chain.from_iterable(batch_sets))
+
+            if shuffle:
+                with data_utils.numpy_seed(seed):
+                    idx = np.random.permutation(len(batches) // self.step_size)
+                    if len(idx) * self.step_size != len(batches):
+                        raise ValueError(
+                            "ERROR: %d %d %d %d"
+                            % (len(idx), self.step_size, len(batches), self.shard_id),
+                            ":".join(["%d" % x for x in self.lengths]),
+                        )
+                    mini_shards = [
+                        batches[i * self.step_size : (i + 1) * self.step_size]
+                        for i in idx
+                    ]
+                    batches = list(itertools.chain.from_iterable(mini_shards))
+
+            return batches
+
+        if self._supports_prefetch:
+            raise NotImplementedError("To be implemented")
+        else:
+            batches = return_full_batches(
+                self.frozen_batches, self.seed + epoch, shuffle
+            )
+            batches = list(
+                ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
+            )
+
+        if offset > 0 and offset >= len(batches):
+            return None
+
+        if self.num_workers > 0:
+            os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning"
+
+        itr = torch.utils.data.DataLoader(
+            self.dataset,
+            collate_fn=self.collate_fn,
+            batch_sampler=batches[offset:],
+            num_workers=self.num_workers,
+            persistent_workers=self.persistent_workers,
+        )
+        if self.buffer_size > 0:
+            itr = BufferedIterator(self.buffer_size, itr)
+
+        return CountingIterator(itr, start=offset)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
new file mode 100644
index 0000000000..fd356ddd04
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
@@ -0,0 +1,477 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+import torch
+from fairseq.data import FairseqDataset, data_utils
+
+
+logger = logging.getLogger(__name__)
+
+
+def collate(
+    samples,
+    pad_idx,
+    eos_idx,
+    left_pad_source=True,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+    pad_to_multiple=1,
+):
+    """Collate samples (dicts with "id", "source" and optionally "target",
+    "prev_output_tokens", "alignment", "constraints") into one padded batch
+    dict whose rows are sorted by descending source length."""
+    if len(samples) == 0:
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx,
+            eos_idx,
+            left_pad,
+            move_eos_to_beginning,
+            pad_to_length=pad_to_length,
+            pad_to_multiple=pad_to_multiple,
+        )
+
+    def check_alignment(alignment, src_len, tgt_len):
+        if alignment is None or len(alignment) == 0:
+            return False
+        if (
+            alignment[:, 0].max().item() >= src_len - 1
+            or alignment[:, 1].max().item() >= tgt_len - 1
+        ):
+            logger.warning("alignment size mismatch found, skipping alignment!")
+            return False
+        return True
+
+    def compute_alignment_weights(alignments):
+        """
+        Given a tensor of shape [:, 2] containing the source-target indices
+        corresponding to the alignments, a weight vector containing the
+        inverse frequency of each target index is computed.
+        E.g. if alignments = [[5, 7], [2, 3], [1, 3], [4, 2]], then
+        a tensor containing [1., 0.5, 0.5, 1] should be returned (since target
+        index 3 is repeated twice)
+        """
+        align_tgt = alignments[:, 1]
+        _, align_tgt_i, align_tgt_c = torch.unique(
+            align_tgt, return_inverse=True, return_counts=True
+        )
+        # counts[inverse] gives, per alignment, how often its target index occurs
+        return 1.0 / align_tgt_c[align_tgt_i].float()
+
+    sample_ids = torch.LongTensor([s["id"] for s in samples])
+    src_tokens = merge(
+        "source",
+        left_pad=left_pad_source,
+        pad_to_length=pad_to_length["source"] if pad_to_length is not None else None,
+    )
+    # sort by descending source length
+    src_lengths = torch.LongTensor(
+        [s["source"].ne(pad_idx).long().sum() for s in samples]
+    )
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    sample_ids = sample_ids.index_select(0, sort_order)
+    src_tokens = src_tokens.index_select(0, sort_order)
+
+    prev_output_tokens = None
+    target = None
+    if samples[0].get("target", None) is not None:
+        tgt_pad_to = pad_to_length["target"] if pad_to_length is not None else None
+        target = merge("target", left_pad=left_pad_target, pad_to_length=tgt_pad_to)
+        target = target.index_select(0, sort_order)
+        tgt_lengths = torch.LongTensor(
+            [s["target"].ne(pad_idx).long().sum() for s in samples]
+        ).index_select(0, sort_order)
+        ntokens = tgt_lengths.sum().item()
+
+        if samples[0].get("prev_output_tokens", None) is not None:
+            prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target)
+        elif input_feeding:
+            # we create a shifted version of targets for feeding the
+            # previous output token(s) into the next decoder step
+            prev_output_tokens = merge(
+                "target",
+                left_pad=left_pad_target,
+                move_eos_to_beginning=True,
+                pad_to_length=tgt_pad_to,
+            )
+    else:
+        ntokens = src_lengths.sum().item()
+
+    # every per-sample tensor below is reordered with ``sort_order`` so rows
+    # of "id", "net_input" and "target" keep referring to the same sample
+    batch = {
+        "id": sample_ids,
+        "nsentences": len(samples),
+        "ntokens": ntokens,
+        "net_input": {
+            "src_tokens": src_tokens,
+            "src_lengths": src_lengths,
+        },
+        "target": target,
+    }
+    if prev_output_tokens is not None:
+        batch["net_input"]["prev_output_tokens"] = prev_output_tokens.index_select(
+            0, sort_order
+        )
+
+    if samples[0].get("alignment", None) is not None:
+        bsz, tgt_sz = batch["target"].shape
+        src_sz = batch["net_input"]["src_tokens"].shape[1]
+
+        # shift alignment indices to positions in the padded batch; target
+        # indices additionally address the flattened (bsz * tgt_sz) layout
+        offsets = torch.zeros((len(sort_order), 2), dtype=torch.long)
+        offsets[:, 1] += torch.arange(len(sort_order), dtype=torch.long) * tgt_sz
+        if left_pad_source:
+            offsets[:, 0] += src_sz - src_lengths
+        if left_pad_target:
+            offsets[:, 1] += tgt_sz - tgt_lengths
+
+        alignments = [
+            alignment + offset
+            for align_idx, offset, src_len, tgt_len in zip(
+                sort_order, offsets, src_lengths, tgt_lengths
+            )
+            for alignment in [samples[align_idx]["alignment"].view(-1, 2)]
+            if check_alignment(alignment, src_len, tgt_len)
+        ]
+
+        if len(alignments) > 0:
+            alignments = torch.cat(alignments, dim=0)
+            align_weights = compute_alignment_weights(alignments)
+
+            batch["alignments"] = alignments
+            batch["align_weights"] = align_weights
+
+    if samples[0].get("constraints", None) is not None:
+        # Collate the packed constraints across the samples, padding to
+        # the length of the longest sample.
+        lens = [sample.get("constraints").size(0) for sample in samples]
+        max_len = max(lens)
+        constraints = torch.zeros((len(samples), max_len)).long()
+        for i, sample in enumerate(samples):
+            constraints[i, 0 : lens[i]] = sample.get("constraints")
+        batch["constraints"] = constraints.index_select(0, sort_order)
+
+    return batch
+
+
+class LanguagePairDataset(FairseqDataset):
+    """
+    A pair of torch.utils.data.Datasets.
+
+    Args:
+        src (torch.utils.data.Dataset): source dataset to wrap
+        src_sizes (List[int]): source sentence lengths
+        src_dict (~fairseq.data.Dictionary): source vocabulary
+        tgt (torch.utils.data.Dataset, optional): target dataset to wrap
+        tgt_sizes (List[int], optional): target sentence lengths
+        tgt_dict (~fairseq.data.Dictionary, optional): target vocabulary
+        left_pad_source (bool, optional): pad source tensors on the left side
+            (default: True).
+        left_pad_target (bool, optional): pad target tensors on the left side
+            (default: False).
+        shuffle (bool, optional): shuffle dataset elements before batching
+            (default: True).
+        input_feeding (bool, optional): create a shifted version of the targets
+            to be passed into the model for teacher forcing (default: True).
+        remove_eos_from_source (bool, optional): if set, removes eos from end
+            of source if it's present (default: False).
+        append_eos_to_target (bool, optional): if set, appends eos to end of
+            target if it's absent (default: False).
+        align_dataset (torch.utils.data.Dataset, optional): dataset
+            containing alignments.
+        constraints (Tensor, optional): 2d tensor with a concatenated, zero-
+            delimited list of constraints for each sentence.
+        append_bos (bool, optional): if set, appends bos to the beginning of
+            source/target sentence.
+        num_buckets (int, optional): if set to a value greater than 0, then
+            batches will be bucketed into the given number of batch shapes.
+        src_lang_id (int, optional): source language ID, if set, the collated batch
+            will contain a field 'src_lang_id' in 'net_input' which indicates the
+            source language of the samples.
+        tgt_lang_id (int, optional): target language ID, if set, the collated batch
+            will contain a field 'tgt_lang_id' which indicates the target language
+             of the samples.
+    """
+
+    def __init__(
+        self,
+        src,
+        src_sizes,
+        src_dict,
+        tgt=None,
+        tgt_sizes=None,
+        tgt_dict=None,
+        left_pad_source=True,
+        left_pad_target=False,
+        shuffle=True,
+        input_feeding=True,
+        remove_eos_from_source=False,
+        append_eos_to_target=False,
+        align_dataset=None,
+        constraints=None,
+        append_bos=False,
+        eos=None,
+        num_buckets=0,
+        src_lang_id=None,
+        tgt_lang_id=None,
+        pad_to_multiple=1,
+    ):
+        if tgt_dict is not None:
+            assert src_dict.pad() == tgt_dict.pad()
+            assert src_dict.eos() == tgt_dict.eos()
+            assert src_dict.unk() == tgt_dict.unk()
+        if tgt is not None:
+            assert len(src) == len(
+                tgt
+            ), "Source and target must contain the same number of examples"
+        self.src = src
+        self.tgt = tgt
+        self.src_sizes = np.array(src_sizes)
+        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
+        self.sizes = (
+            np.vstack((self.src_sizes, self.tgt_sizes)).T
+            if self.tgt_sizes is not None
+            else self.src_sizes
+        )
+        self.src_dict = src_dict
+        self.tgt_dict = tgt_dict
+        self.left_pad_source = left_pad_source
+        self.left_pad_target = left_pad_target
+        self.shuffle = shuffle
+        self.input_feeding = input_feeding
+        self.remove_eos_from_source = remove_eos_from_source
+        self.append_eos_to_target = append_eos_to_target
+        self.align_dataset = align_dataset
+        if self.align_dataset is not None:
+            assert (
+                self.tgt_sizes is not None
+            ), "Both source and target needed when alignments are provided"
+        self.constraints = constraints
+        self.append_bos = append_bos
+        self.eos = eos if eos is not None else src_dict.eos()
+        self.src_lang_id = src_lang_id
+        self.tgt_lang_id = tgt_lang_id
+        if num_buckets > 0:
+            from fairseq.data import BucketPadLengthDataset
+
+            self.src = BucketPadLengthDataset(
+                self.src,
+                sizes=self.src_sizes,
+                num_buckets=num_buckets,
+                pad_idx=self.src_dict.pad(),
+                left_pad=self.left_pad_source,
+            )
+            self.src_sizes = self.src.sizes
+            logger.info("bucketing source lengths: {}".format(list(self.src.buckets)))
+            if self.tgt is not None:
+                self.tgt = BucketPadLengthDataset(
+                    self.tgt,
+                    sizes=self.tgt_sizes,
+                    num_buckets=num_buckets,
+                    pad_idx=self.tgt_dict.pad(),
+                    left_pad=self.left_pad_target,
+                )
+                self.tgt_sizes = self.tgt.sizes
+                logger.info(
+                    "bucketing target lengths: {}".format(list(self.tgt.buckets))
+                )
+
+            # determine bucket sizes using self.num_tokens, which reads the sizes
+            # replaced above; np.int64 stands in for the deprecated np.compat.long
+            num_tokens = np.vectorize(self.num_tokens, otypes=[np.int64])
+            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
+            self.buckets = [
+                (None, bucket_len) for bucket_len in np.unique(self.bucketed_num_tokens)
+            ]
+        else:
+            self.buckets = None
+        self.pad_to_multiple = pad_to_multiple
+
+    def get_batch_shapes(self):
+        return self.buckets  # None unless bucketing (num_buckets > 0) was enabled
+
+    def __getitem__(self, index):
+        """Return the example at *index* as a dict with "id", "source", "target"
+        and, when configured, "alignment" and "constraints". The optional
+        EOS/BOS edits below are applied cumulatively to the fetched items.
+        """
+        tgt_item = self.tgt[index] if self.tgt is not None else None
+        src_item = self.src[index]
+        # Append EOS to end of tgt sentence if it does not have an EOS and remove
+        # EOS from end of src sentence if it exists. This is useful when we
+        # use existing datasets for opposite directions i.e., when we want to
+        # use tgt_dataset as src_dataset and vice versa
+        if self.append_eos_to_target:
+            eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos()
+            if tgt_item is not None and tgt_item[-1] != eos:
+                tgt_item = torch.cat([tgt_item, torch.LongTensor([eos])])
+
+        if self.append_bos:
+            bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos()
+            if tgt_item is not None and tgt_item[0] != bos:
+                tgt_item = torch.cat([torch.LongTensor([bos]), tgt_item])
+
+            bos = self.src_dict.bos()
+            if src_item[0] != bos:
+                src_item = torch.cat([torch.LongTensor([bos]), src_item])
+
+        if self.remove_eos_from_source:
+            eos = self.src_dict.eos()
+            if src_item[-1] == eos:
+                src_item = src_item[:-1]
+
+        example = {"id": index, "source": src_item, "target": tgt_item}
+        if self.align_dataset is not None:
+            example["alignment"] = self.align_dataset[index]
+        if self.constraints is not None:
+            example["constraints"] = self.constraints[index]
+        return example
+
+    def __len__(self):
+        return len(self.src)  # one example per entry of the source dataset
+
+    def collater(self, samples, pad_to_length=None):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+            pad_to_length (dict, optional): a dictionary of
+                {'source': source_pad_to_length, 'target': target_pad_to_length}
+                to indicate the max length to pad to in source and target respectively.
+
+        Returns:
+            dict: empty if *samples* is empty, else a mini-batch with the keys:
+
+                - `id` (LongTensor): example IDs, in the batch's (sorted) row order
+                - `ntokens` (int): total number of tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in
+                    the source sentence of shape `(bsz, src_len)`. Padding will
+                    appear on the left if *left_pad_source* is ``True``.
+                  - `src_lengths` (LongTensor): 1D Tensor of the unpadded
+                    lengths of each source sentence of shape `(bsz)`
+                  - `prev_output_tokens` (LongTensor): a padded 2D Tensor of
+                    tokens in the target sentence, shifted right by one
+                    position for teacher forcing, of shape `(bsz, tgt_len)`.
+                    This key will not be present if *input_feeding* is
+                    ``False``.  Padding will appear on the left if
+                    *left_pad_target* is ``True``.
+                  - `src_lang_id` (LongTensor): a long Tensor which contains source
+                    language IDs of each sample in the batch
+
+                - `target` (LongTensor): a padded 2D Tensor of tokens in the
+                  target sentence of shape `(bsz, tgt_len)`. Padding will appear
+                  on the left if *left_pad_target* is ``True``.
+                - `tgt_lang_id` (LongTensor): a long Tensor which contains target language
+                   IDs of each sample in the batch
+        """
+        res = collate(
+            samples,
+            pad_idx=self.src_dict.pad(),
+            eos_idx=self.eos,
+            left_pad_source=self.left_pad_source,
+            left_pad_target=self.left_pad_target,
+            input_feeding=self.input_feeding,
+            pad_to_length=pad_to_length,
+            pad_to_multiple=self.pad_to_multiple,
+        )
+        if res and (self.src_lang_id is not None or self.tgt_lang_id is not None):
+            src_tokens = res["net_input"]["src_tokens"]
+            bsz = src_tokens.size(0)
+            if self.src_lang_id is not None:
+                res["net_input"]["src_lang_id"] = (
+                    torch.LongTensor([[self.src_lang_id]]).expand(bsz, 1).to(src_tokens)
+                )
+            if self.tgt_lang_id is not None:
+                res["tgt_lang_id"] = (
+                    torch.LongTensor([[self.tgt_lang_id]]).expand(bsz, 1).to(src_tokens)
+                )
+        return res
+
+    def num_tokens(self, index):
+        """Token count of sample *index*: the larger of its source and target
+        lengths (target counts as 0 when absent); drives ``--max-tokens``."""
+        src_len = self.src_sizes[index]
+        if self.tgt_sizes is None:
+            return max(src_len, 0)
+        return max(src_len, self.tgt_sizes[index])
+
+    def num_tokens_vec(self, indices):
+        """Vectorized ``num_tokens``: per-position max of source and target
+        lengths for *indices*; used to enforce ``--max-tokens`` in batching."""
+        src_part = self.src_sizes[indices]
+        if self.tgt_sizes is None:
+            return src_part
+        return np.maximum(src_part, self.tgt_sizes[indices])
+
+    def size(self, index):
+        """Size of example *index* as a ``(source_len, target_len)`` tuple, with
+        0 for a missing target; used when filtering by ``--max-positions``."""
+        tgt_len = 0
+        if self.tgt_sizes is not None:
+            tgt_len = self.tgt_sizes[index]
+        return (self.src_sizes[index], tgt_len)
+
+    def ordered_indices(self):
+        """Return the order in which examples are visited when batches are
+        built: an optional random permutation, stably sorted by length."""
+        n = len(self)
+        order = np.arange(n, dtype=np.int64)
+        if self.shuffle:
+            order = np.random.permutation(n).astype(np.int64)
+        if self.buckets is not None:
+            # bucketed: sort by max(padded_src_len, padded_tgt_len)
+            keys = self.bucketed_num_tokens[order]
+            return order[np.argsort(keys, kind="mergesort")]
+        # unbucketed: sort by target length, then source length; mergesort is
+        # stable, so the second sort keeps target order among equal sources
+        if self.tgt_sizes is not None:
+            tgt_keys = self.tgt_sizes[order]
+            order = order[np.argsort(tgt_keys, kind="mergesort")]
+        src_keys = self.src_sizes[order]
+        return order[np.argsort(src_keys, kind="mergesort")]
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.src, "supports_prefetch", False) and (
+            getattr(self.tgt, "supports_prefetch", False) or self.tgt is None
+        )  # a missing target (self.tgt is None) does not block prefetching
+
+    def prefetch(self, indices):
+        """Forward *indices* to the source, target and alignment datasets."""
+        self.src.prefetch(indices)
+        for part in (self.tgt, self.align_dataset):
+            if part is not None:
+                part.prefetch(indices)
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """Filter a list of sample indices. Remove those that are longer
+        than specified in max_sizes (delegates to data_utils).
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        return data_utils.filter_paired_dataset_indices_by_size(
+            self.src_sizes,
+            self.tgt_sizes,
+            indices,
+            max_sizes,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
new file mode 100644
index 0000000000..9bd5c72b5e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .block_pair_dataset import BlockPairDataset
+from .masked_lm_dataset import MaskedLMDataset
+from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary
+
+
+__all__ = [
+    "BertDictionary",
+    "BlockPairDataset",
+    "MaskedLMDataset",
+    "MaskedLMDictionary",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
new file mode 100644
index 0000000000..ba069b4605
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
@@ -0,0 +1,311 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import numpy as np
+import torch
+from fairseq.data import FairseqDataset
+
+
+class BlockPairDataset(FairseqDataset):
+    """Break a Dataset of tokens into sentence pair blocks for next sentence
+       prediction as well as masked language model.
+
+       High-level logics are:
+       1. break input tensor to tensor blocks
+       2. pair the blocks with 50% next sentence and 50% random sentence
+       3. return paired blocks as well as related segment labels
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to break into blocks
+        sizes: array of sentence lengths
+        dictionary: dictionary for the task
+        block_size: maximum block size
+        break_mode: mode for breaking corpus into block pairs. Currently we support
+            2 modes
+            doc: respect document boundaries and each part of the pair should belong to one document
+            none: don't respect any boundary and cut tokens evenly
+        short_seq_prob: probability for generating shorter block pairs
+        doc_break_size: Size for empty line separating documents. Typically 1 if
+                        the sentences have eos, 0 otherwise.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        dictionary,
+        sizes,
+        block_size,
+        break_mode="doc",
+        short_seq_prob=0.1,
+        doc_break_size=1,
+    ):
+        """Break ``dataset`` into paired blocks (arguments: see class docstring).
+
+        Raises ValueError if ``break_mode`` is not "doc", "none" or None.
+        """
+        super().__init__()
+        self.dataset = dataset
+        self.pad = dictionary.pad()
+        self.eos = dictionary.eos()
+        self.cls = dictionary.cls()
+        self.mask = dictionary.mask()
+        self.sep = dictionary.sep()
+        self.break_mode = break_mode
+        self.dictionary = dictionary
+        self.short_seq_prob = short_seq_prob
+        self.block_indices = []
+        # filled by _generate_sentence_pair ("doc") or _pair_sentences ("none")
+        self.sent_pairs = []
+        self.sizes = []
+
+        assert len(dataset) == len(sizes)
+
+        if break_mode == "doc":
+            cur_doc = []
+            for sent_id, sz in enumerate(sizes):
+                assert doc_break_size == 0 or sz != 0, (
+                    "when doc_break_size is non-zero, we expect documents to be "
+                    "separated by a blank line with a single eos."
+                )
+                # empty line as document separator
+                if sz == doc_break_size:
+                    if len(cur_doc) == 0:
+                        continue
+                    self.block_indices.append(cur_doc)
+                    cur_doc = []
+                else:
+                    cur_doc.append(sent_id)
+            max_num_tokens = block_size - 3  # Account for [CLS], [SEP], [SEP]
+            for doc_id, doc in enumerate(self.block_indices):
+                self._generate_sentence_pair(doc, doc_id, max_num_tokens, sizes)
+        elif break_mode is None or break_mode == "none":
+            # each block should have half of the block size since we are constructing block pair
+            sent_length = (block_size - 3) // 2
+            total_len = sum(dataset.sizes)
+            length = math.ceil(total_len / sent_length)
+            # token span [start, end) of every block; the last one may be shorter
+            starts = np.arange(length) * sent_length
+            sent_sizes = np.minimum(starts + sent_length, total_len) - starts
+            dataset_index = self._sent_to_dataset_index(sent_sizes)
+
+            # pair sentences
+            self._pair_sentences(dataset_index)
+        else:
+            raise ValueError("Invalid break_mode: " + str(break_mode))
+
+    def _pair_sentences(self, dataset_index):
+        """
+        Given a list of evenly cut blocks/sentences, pair these sentences with 50%
+        consecutive sentences and 50% random sentences.
+        This is used for none break mode
+        """
+        # pair sentences
+        for sent_id, sent in enumerate(dataset_index):
+            next_sent_label = (
+                1 if np.random.rand() > 0.5 and sent_id != len(dataset_index) - 1 else 0
+            )
+            if next_sent_label:
+                next_sent = dataset_index[sent_id + 1]
+            else:
+                next_sent = dataset_index[
+                    self._skip_sampling(len(dataset_index), [sent_id, sent_id + 1])
+                ]
+            self.sent_pairs.append((sent, next_sent, next_sent_label))
+
+            # The current blocks don't include the special tokens but the
+            # sizes already account for this (3 + both block lengths)
+            self.sizes.append(3 + sent[3] + next_sent[3])
+
+    def _sent_to_dataset_index(self, sent_sizes):
+        """
+        Build index mapping block indices to the underlying dataset indices.
+
+        Blocks are cut from the concatenated token stream of ``self.dataset``,
+        so sentence boundaries are taken from ``self.dataset.sizes``; each
+        entry of ``sent_sizes`` is the number of tokens in one block.
+        """
+        ds_sizes = self.dataset.sizes
+        dataset_index = []
+        ds_idx, ds_remaining = -1, 0
+        for to_consume in sent_sizes:
+            sent_size = to_consume
+            if ds_remaining == 0:
+                ds_idx += 1
+                ds_remaining = ds_sizes[ds_idx]
+            start_ds_idx = ds_idx
+            start_offset = ds_sizes[ds_idx] - ds_remaining
+            while to_consume > ds_remaining:
+                # block spills over into the next dataset item
+                to_consume -= ds_remaining
+                ds_idx += 1
+                ds_remaining = ds_sizes[ds_idx]
+            ds_remaining -= to_consume
+            # (first dataset idx, offset inside it, last dataset idx, length)
+            dataset_index.append((start_ds_idx, start_offset, ds_idx, sent_size))
+        assert ds_remaining == 0
+        assert ds_idx == len(self.dataset) - 1
+        return dataset_index
+
+    def _generate_sentence_pair(self, doc, doc_id, max_num_tokens, sizes):
+        """
+        Go through a single document and generate sentence pairs from it
+        """
+        current_chunk = []
+        current_length = 0
+        curr = 0
+        # To provide more randomness, we decrease target seq length for parts of
+        # samples (10% by default). Note that max_num_tokens is the hard threshold
+        # for batching and will never be changed.
+        target_seq_length = max_num_tokens
+        if np.random.random() < self.short_seq_prob:
+            target_seq_length = np.random.randint(2, max_num_tokens)
+        # loop through all sentences in document
+        while curr < len(doc):
+            sent_id = doc[curr]
+            current_chunk.append(sent_id)
+            current_length = sum(sizes[current_chunk])
+            # split chunk and generate pair when exceed target_seq_length or
+            # finish the loop
+            if curr == len(doc) - 1 or current_length >= target_seq_length:
+                # split the chunk into 2 parts
+                a_end = 1
+                if len(current_chunk) > 2:
+                    a_end = np.random.randint(1, len(current_chunk) - 1)
+                sent_a = current_chunk[:a_end]
+                len_a = sum(sizes[sent_a])
+                # generate next sentence label, note that if there is only 1 sentence
+                # in current chunk, label is always 0
+                next_sent_label = (
+                    1 if np.random.rand() > 0.5 and len(current_chunk) != 1 else 0
+                )
+                if not next_sent_label:
+                    # if next sentence label is 0, sample sent_b from a random doc
+                    target_b_length = target_seq_length - len_a
+                    rand_doc_id = self._skip_sampling(len(self.block_indices), [doc_id])
+                    random_doc = self.block_indices[rand_doc_id]
+                    random_start = np.random.randint(0, len(random_doc))
+                    sent_b = []
+                    len_b = 0
+                    for j in range(random_start, len(random_doc)):
+                        sent_b.append(random_doc[j])
+                        len_b = sum(sizes[sent_b])
+                        if len_b >= target_b_length:
+                            break
+                    # return the second part of the chunk since it's not used
+                    num_unused_segments = len(current_chunk) - a_end
+                    curr -= num_unused_segments
+                else:
+                    # if next sentence label is 1, use the second part of chunk as sent_b
+                    sent_b = current_chunk[a_end:]
+                    len_b = sum(sizes[sent_b])
+                # currently sent_a and sent_b may be longer than max_num_tokens,
+                # truncate them and return block idx and offsets for them
+                sent_a, sent_b = self._truncate_sentences(
+                    sent_a, sent_b, max_num_tokens
+                )
+                self.sent_pairs.append((sent_a, sent_b, next_sent_label))
+                self.sizes.append(3 + sent_a[3] + sent_b[3])
+                current_chunk = []
+            curr += 1
+
+    def _skip_sampling(self, total, skip_ids):
+        """
+        Generate a random integer which is not in skip_ids. Sample range is [0, total)
+        TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later
+        """
+        rand_id = np.random.randint(total - len(skip_ids))  # needs total > len(skip_ids)
+        return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids)
+
+    def _truncate_sentences(self, sent_a, sent_b, max_num_tokens):
+        """
+        Truncate a pair of sentences so their total length is at most max_num_tokens
+        Logics:
+            1. Truncate longer sentence
+            2. Tokens to be truncated could be at the beginning or the end of the sentence
+        Returns:
+            Truncated sentences represented by dataset idx
+        """
+        len_a, len_b = sum(self.dataset.sizes[sent_a]), sum(self.dataset.sizes[sent_b])
+        front_cut_a = front_cut_b = end_cut_a = end_cut_b = 0
+
+        while True:
+            total_length = (
+                len_a + len_b - front_cut_a - front_cut_b - end_cut_a - end_cut_b
+            )
+            if total_length <= max_num_tokens:
+                break
+
+            if len_a - front_cut_a - end_cut_a > len_b - front_cut_b - end_cut_b:
+                if np.random.rand() < 0.5:
+                    front_cut_a += 1
+                else:
+                    end_cut_a += 1
+            else:
+                if np.random.rand() < 0.5:
+                    front_cut_b += 1
+                else:
+                    end_cut_b += 1
+
+        # calculate ds indices as well as offsets and return
+        truncated_sent_a = self._cut_sentence(sent_a, front_cut_a, end_cut_a)
+        truncated_sent_b = self._cut_sentence(sent_b, front_cut_b, end_cut_b)
+        return truncated_sent_a, truncated_sent_b
+
+    def _cut_sentence(self, sent, front_cut, end_cut):
+        """
+        Cut front_cut tokens from the start and end_cut tokens from the end of sent.
+        Returns (start_ds_idx, offset, end_ds_idx, target_len) in dataset indices.
+        """
+        start_ds_idx, end_ds_idx, offset = sent[0], sent[-1], 0
+        target_len = sum(self.dataset.sizes[sent]) - front_cut - end_cut
+        while front_cut > 0:
+            if self.dataset.sizes[start_ds_idx] > front_cut:
+                offset += front_cut  # remaining cut falls inside this item
+                break
+            else:
+                front_cut -= self.dataset.sizes[start_ds_idx]  # drop the whole item
+                start_ds_idx += 1
+        while end_cut > 0:
+            if self.dataset.sizes[end_ds_idx] > end_cut:
+                break  # the tail is trimmed later through target_len
+            else:
+                end_cut -= self.dataset.sizes[end_ds_idx]  # drop the whole item
+                end_ds_idx -= 1
+        return start_ds_idx, offset, end_ds_idx, target_len
+
+    def _fetch_block(self, start_ds_idx, offset, end_ds_idx, length):
+        """
+        Concatenate dataset items ``start_ds_idx..end_ds_idx`` (inclusive) and
+        return the ``length`` tokens that start at ``offset`` in that buffer.
+        """
+        pieces = []
+        for ds_idx in range(start_ds_idx, end_ds_idx + 1):
+            pieces.append(self.dataset[ds_idx])
+        return torch.cat(pieces)[offset : offset + length]
+
+    def __getitem__(self, index):
+        """Return ``(tokens_a, tokens_b, next_sent_label)`` for pair *index*."""
+        spec_a, spec_b, label = self.sent_pairs[index]
+        tokens_a, tokens_b = self._fetch_block(*spec_a), self._fetch_block(*spec_b)
+        return tokens_a, tokens_b, label
+
+    def __len__(self):
+        return len(self.sizes)  # one size entry is appended per sentence pair
+
+    @property
+    def supports_prefetch(self):  # mirrors the wrapped dataset's flag, default False
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        """Prefetch every dataset item spanned by the pairs at *indices*."""
+        needed = set()
+        for index in indices:
+            first, second, _ = self.sent_pairs[index]
+            # a block is (start_ds_idx, offset, end_ds_idx, length)
+            needed.update(range(first[0], first[2] + 1))
+            needed.update(range(second[0], second[2] + 1))
+        self.dataset.prefetch(needed)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
new file mode 100644
index 0000000000..dd8ea2c60a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
@@ -0,0 +1,303 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from fairseq.data import Dictionary, FairseqDataset, data_utils
+from fairseq.data.concat_dataset import ConcatDataset
+from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
+from fairseq.data.token_block_dataset import TokenBlockDataset
+
+
+class MaskedLMDataset(FairseqDataset):
+    """
+    A wrapper Dataset for masked language modelling. The dataset
+    wraps around TokenBlockDataset or BlockedPairDataset and creates a batch
+    where the input blocks are masked according to the specified masking
+    probability. Additionally the batch can also contain sentence level targets
+    if this is specified.
+
+    Args:
+        dataset: Dataset which generates blocks of data. Only BlockPairDataset
+            and TokenBlockDataset are supported.
+        sizes: Sentence lengths
+        vocab: Dictionary with the vocabulary and special tokens.
+        pad_idx: Id of padding token in dictionary
+        mask_idx: Id of mask token in dictionary
+        classif_token_idx: Id of classification token in dictionary. This is the
+            token associated with the sentence embedding (Eg: CLS for BERT)
+        sep_token_idx: Id of separator token in dictionary
+            (Eg: SEP in BERT)
+        seed: Seed for random number generator for reproducibility.
+        shuffle: Shuffle the elements before batching.
+        has_pairs: Specifies whether the underlying dataset
+            generates a pair of blocks along with a sentence_target or not.
+            Setting it to True assumes that the underlying dataset generates a
+            label for the pair of sentences which is surfaced as
+            sentence_target. The default value assumes a single block with no
+            sentence target.
+        segment_id: An optional segment id for filling in the segment labels
+            when we are in the single block setting (Eg: XLM). Default is 0.
+        masking_ratio: specifies what percentage of the blocks should be masked.
+        masking_prob: specifies the probability of a given token being
+            replaced with the "MASK" token.
+        random_token_prob: specifies the probability of a given token being
+            replaced by a random token from the vocabulary.
+    """
+
+    def __init__(
+        self,
+        dataset: FairseqDataset,
+        sizes: np.ndarray,
+        vocab: Dictionary,
+        pad_idx: int,
+        mask_idx: int,
+        classif_token_idx: int,
+        sep_token_idx: int,
+        seed: int = 1,
+        shuffle: bool = True,
+        has_pairs: bool = True,
+        segment_id: int = 0,
+        masking_ratio: float = 0.15,
+        masking_prob: float = 0.8,
+        random_token_prob: float = 0.1,
+    ):
+        # Make sure the input datasets are the ones supported
+        assert (
+            isinstance(dataset, TokenBlockDataset)
+            or isinstance(dataset, BlockPairDataset)
+            or isinstance(dataset, ConcatDataset)
+        ), (
+            "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or "
+            "ConcatDataset"
+        )
+
+        self.dataset = dataset
+        self.sizes = np.array(sizes)
+        self.vocab = vocab
+        self.pad_idx = pad_idx
+        self.mask_idx = mask_idx
+        self.classif_token_idx = classif_token_idx
+        self.sep_token_idx = sep_token_idx
+        self.shuffle = shuffle
+        self.seed = seed
+        self.has_pairs = has_pairs
+        self.segment_id = segment_id
+        self.masking_ratio = masking_ratio
+        self.masking_prob = masking_prob
+        self.random_token_prob = random_token_prob
+
+        # If we have only one block then sizes needs to be updated to include
+        # the classification token
+        if not has_pairs:
+            self.sizes = self.sizes + 1
+
+    def __getitem__(self, index: int):
+        # if has_pairs, then expect 2 blocks and a sentence target
+        if self.has_pairs:
+            (block_one, block_two, sentence_target) = self.dataset[index]
+        else:
+            block_one = self.dataset[index]
+
+        return {
+            "id": index,
+            "block_one": block_one,
+            "block_two": block_two if self.has_pairs else None,
+            "sentence_target": sentence_target if self.has_pairs else None,
+        }
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def _mask_block(
+        self,
+        sentence: np.ndarray,
+        mask_idx: int,
+        pad_idx: int,
+        dictionary_token_range: Tuple,
+    ):
+        """
+        Mask tokens for Masked Language Model training
+        Samples mask_ratio tokens that will be predicted by LM.
+
+        Note:This function may not be efficient enough since we had multiple
+        conversions between np and torch, we can replace them with torch
+        operators later.
+
+        Args:
+            sentence: 1d tensor to be masked
+            mask_idx: index to use for masking the sentence
+            pad_idx: index to use for masking the target for tokens we aren't
+                predicting
+            dictionary_token_range: range of indices in dictionary which can
+                be used for random word replacement
+                (e.g. without special characters)
+        Return:
+            masked_sent: masked sentence
+            target: target with words which we are not predicting replaced
+                by pad_idx
+        """
+        masked_sent = np.copy(sentence)
+        sent_length = len(sentence)
+        mask_num = math.ceil(sent_length * self.masking_ratio)
+        mask = np.random.choice(sent_length, mask_num, replace=False)
+        target = np.copy(sentence)
+
+        for i in range(sent_length):
+            if i in mask:
+                rand = np.random.random()
+
+                # replace with mask if probability is less than masking_prob
+                # (Eg: 0.8)
+                if rand < self.masking_prob:
+                    masked_sent[i] = mask_idx
+
+                # replace with random token if probability is less than
+                # masking_prob + random_token_prob (Eg: 0.9)
+                elif rand < (self.masking_prob + self.random_token_prob):
+                    # sample random token from dictionary
+                    masked_sent[i] = np.random.randint(
+                        dictionary_token_range[0], dictionary_token_range[1]
+                    )
+            else:
+                target[i] = pad_idx
+
+        return masked_sent, target
+
+    def _collate(self, samples: List[Dict], pad_idx: int, eos_idx: int):
+        """
+        Does the heavy lifting for creating a batch from the input list of
+        examples. The logic is as follows:
+            1. Mask the input blocks. In case has_pair is True then we have 2
+               blocks to mask.
+            2. Prepend the first masked block tensor with the special token
+               used as sentence embedding. Eg: CLS in BERT. This happens
+               irrespective of the value of has_pair.
+            3. If has_pair is True, then append the first masked block with the
+               special separator token (eg: SEP for BERT) and compute segment
+               label accordingly. In this case, also append the second masked
+               block with this special separator token and compute its segment
+               label.
+            4. For the targets tensor, prepend and append with padding index
+               accordingly.
+            5. Concatenate all tensors.
+        """
+        if len(samples) == 0:
+            return {}
+        # To ensure determinism, we reset the state of the PRNG after every
+        # batch based on the seed and the first id of the batch. This ensures
+        # that across epochs we get the same mask for the same example. This
+        # is needed for reproducibility and is how BERT does masking
+        # TODO: Can we add deteminism without this constraint?
+        with data_utils.numpy_seed(self.seed + samples[0]["id"]):
+            for s in samples:
+
+                # token range is needed for replacing with random token during
+                # masking
+                token_range = (self.vocab.nspecial, len(self.vocab))
+
+                # mask according to specified probabilities.
+                masked_blk_one, masked_tgt_one = self._mask_block(
+                    s["block_one"],
+                    self.mask_idx,
+                    self.pad_idx,
+                    token_range,
+                )
+
+                tokens = np.concatenate([[self.classif_token_idx], masked_blk_one])
+                targets = np.concatenate([[self.pad_idx], masked_tgt_one])
+                segments = np.ones(len(tokens)) * self.segment_id
+
+                # if has_pairs is True then we need to add the SEP token to both
+                # the blocks after masking and re-compute segments based on the new
+                # lengths.
+                if self.has_pairs:
+                    tokens_one = np.concatenate([tokens, [self.sep_token_idx]])
+                    targets_one = np.concatenate([targets, [self.pad_idx]])
+
+                    masked_blk_two, masked_tgt_two = self._mask_block(
+                        s["block_two"], self.mask_idx, self.pad_idx, token_range
+                    )
+                    tokens_two = np.concatenate([masked_blk_two, [self.sep_token_idx]])
+                    targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]])
+
+                    # block + 1 sep + 1 special (CLS)
+                    segments_one = np.zeros(len(tokens_one))
+                    # block + 1 sep
+                    segments_two = np.ones(len(tokens_two))
+
+                    tokens = np.concatenate([tokens_one, tokens_two])
+                    targets = np.concatenate([targets_one, targets_two])
+                    segments = np.concatenate([segments_one, segments_two])
+
+                s["source"] = torch.LongTensor(tokens)
+                s["segment_labels"] = torch.LongTensor(segments)
+                s["lm_target"] = torch.LongTensor(targets)
+
+        def merge(key):
+            return data_utils.collate_tokens(
+                [s[key] for s in samples], pad_idx, eos_idx, left_pad=False
+            )
+
+        return {
+            "id": torch.LongTensor([s["id"] for s in samples]),
+            "ntokens": sum(len(s["source"]) for s in samples),
+            "net_input": {
+                "src_tokens": merge("source"),
+                "segment_labels": merge("segment_labels"),
+            },
+            "lm_target": merge("lm_target"),
+            "sentence_target": torch.LongTensor([s["sentence_target"] for s in samples])
+            if self.has_pairs
+            else None,
+            "nsentences": len(samples),
+        }
+
+    def collater(self, samples: List[Dict]):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch of data
+        """
+        return self._collate(samples, self.vocab.pad(), self.vocab.eos())
+
+    def num_tokens(self, index: int):
+        """
+        Return the number of tokens in a sample. This value is used to
+        enforce max-tokens during batching.
+        """
+        return self.sizes[index]
+
+    def size(self, index: int):
+        """
+        Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with max-positions.
+        """
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """
+        Return an ordered list of indices. Batches will be constructed based
+        on this order.
+        """
+        if self.shuffle:
+            return np.random.permutation(len(self))
+        else:
+            order = [np.arange(len(self))]
+            order.append(self.sizes)
+            return np.lexsort(order)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
new file mode 100644
index 0000000000..dee88f7a3e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
@@ -0,0 +1,60 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data import Dictionary
+
+
+class MaskedLMDictionary(Dictionary):
+    """
+    Dictionary for Masked Language Modelling tasks. This extends Dictionary by
+    adding the mask symbol and counting it among the ``nspecial`` symbols.
+    """
+
+    def __init__(
+        self,
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+        mask="<mask>",
+    ):
+        super().__init__(pad=pad, eos=eos, unk=unk)
+        self.mask_word = mask
+        self.mask_index = self.add_symbol(mask)  # index returned by add_symbol
+        self.nspecial = len(self.symbols)  # recomputed after the mask is added
+
+    def mask(self):
+        """Return the index of the mask symbol."""
+        return self.mask_index
+
+
+class BertDictionary(MaskedLMDictionary):
+    """
+    Dictionary for BERT task. This extends MaskedLMDictionary by adding the
+    cls and sep symbols (in that order) and counting them in ``nspecial``.
+    """
+
+    def __init__(
+        self,
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+        mask="<mask>",
+        cls="<cls>",
+        sep="<sep>",
+    ):
+        super().__init__(pad=pad, eos=eos, unk=unk, mask=mask)
+        self.cls_word = cls
+        self.sep_word = sep
+        self.cls_index = self.add_symbol(cls)  # added before sep
+        self.sep_index = self.add_symbol(sep)
+        self.nspecial = len(self.symbols)  # recomputed after cls and sep are added
+
+    def cls(self):
+        """Return the index of the cls symbol."""
+        return self.cls_index
+
+    def sep(self):
+        """Return the index of the sep symbol."""
+        return self.sep_index
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
new file mode 100644
index 0000000000..12f00aa436
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class ListDataset(BaseWrapperDataset):
+    def __init__(self, dataset, sizes=None):
+        super().__init__(dataset)
+        self._sizes = sizes
+
+    def __iter__(self):
+        for x in self.dataset:
+            yield x
+
+    def collater(self, samples):
+        return samples
+
+    @property
+    def sizes(self):
+        return self._sizes
+
+    def num_tokens(self, index):
+        return self.sizes[index]
+
+    def size(self, index):
+        return self.sizes[index]
+
+    def set_epoch(self, epoch):
+        pass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
new file mode 100644
index 0000000000..1a945927cf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
@@ -0,0 +1,97 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from typing import Dict
+
+from fairseq.data.monolingual_dataset import MonolingualDataset
+
+from . import FairseqDataset
+
+
+class LMContextWindowDataset(FairseqDataset):
+    """
+    Wraps a MonolingualDataset and provides more context for evaluation.
+
+    ``collater`` prepends tokens carried over from earlier samples; each item
+    then has a maximum size of ``tokens_per_sample + context_window``.
+
+    Args:
+        dataset: dataset to wrap
+        tokens_per_sample (int): the max number of tokens in each dataset item
+        context_window (int): the number of accumulated tokens to add to each
+            dataset item; must be greater than 0
+        pad_idx (int): padding symbol
+    """
+
+    def __init__(
+        self,
+        dataset: MonolingualDataset,
+        tokens_per_sample: int,
+        context_window: int,
+        pad_idx: int,
+    ):
+        assert context_window > 0
+        self.dataset = dataset
+        self.tokens_per_sample = tokens_per_sample
+        self.context_window = context_window
+        self.pad_idx = pad_idx
+        self.prev_tokens = np.empty([0])  # carried-over context, updated by collater
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples) -> Dict:  # also adds sample["start_indices"]
+        sample = self.dataset.collater(samples)  # collate with the wrapped dataset
+
+        pad = self.pad_idx
+        max_sample_len = self.tokens_per_sample + self.context_window
+
+        bsz, tsz = sample["net_input"]["src_tokens"].shape
+        start_idxs = [0] * bsz
+        toks = sample["net_input"]["src_tokens"]
+        lengths = sample["net_input"]["src_lengths"]
+        tgt = sample["target"]
+        new_toks = np.empty([bsz, tsz + self.context_window], dtype=np.int64)
+        new_tgt = np.full([bsz, tsz + self.context_window], pad, dtype=np.int64)
+        sample_lens = toks.ne(pad).long().sum(dim=1).cpu()  # non-pad tokens per row
+        for i in range(bsz):  # rows in order; prev_tokens carries over between rows
+            sample_len = sample_lens[i]
+            extra = len(self.prev_tokens) + sample_len - max_sample_len
+            if extra > 0:  # drop the oldest context tokens to respect max_sample_len
+                self.prev_tokens = self.prev_tokens[extra:]
+            pads = np.full(self.context_window - len(self.prev_tokens), pad)
+            new_toks[i] = np.concatenate([self.prev_tokens, toks[i].numpy(), pads])
+            new_tgt[
+                i, len(self.prev_tokens) : len(self.prev_tokens) + len(tgt[i])
+            ] = tgt[i]  # context positions keep ``pad`` in the target
+            start_idxs[i] = len(self.prev_tokens)  # where the original row starts
+            lengths[i] += len(self.prev_tokens)  # in-place update of src_lengths
+            self.prev_tokens = new_toks[i][new_toks[i] != pad][-self.context_window :]
+        sample["net_input"]["src_tokens"] = torch.from_numpy(new_toks)
+        sample["target"] = torch.from_numpy(new_tgt)
+        sample["start_indices"] = start_idxs
+        return sample
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    def ordered_indices(self):
+        # NOTE we don't shuffle the data to retain access to the previous dataset elements
+        return np.arange(len(self.dataset))
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
new file mode 100644
index 0000000000..a7854ac170
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import lru_cache
+
+from . import BaseWrapperDataset
+
+
+class LRUCacheDataset(BaseWrapperDataset):
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+
+    @lru_cache(maxsize=8)
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    @lru_cache(maxsize=8)
+    def collater(self, samples):
+        return self.dataset.collater(samples)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
new file mode 100644
index 0000000000..0ca9051c9a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
@@ -0,0 +1,226 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import lru_cache
+
+import numpy as np
+import torch
+from fairseq.data import Dictionary, data_utils
+
+from . import BaseWrapperDataset, LRUCacheDataset
+
+
+class MaskTokensDataset(BaseWrapperDataset):
+    """
+    A wrapper Dataset for masked language modeling.
+
+    Input items are masked according to the specified masking probability.
+
+    Args:
+        dataset: Dataset to wrap.
+        sizes: Sentence lengths
+        vocab: Dictionary with the vocabulary and special tokens.
+        pad_idx: Id of pad token in vocab
+        mask_idx: Id of mask token in vocab
+        return_masked_tokens: controls whether to return the non-masked tokens
+            (the default) or to return a tensor with the original masked token
+            IDs (and *pad_idx* elsewhere). The latter is useful as targets for
+            masked LM training.
+        seed: Seed for random number generator for reproducibility.
+        mask_prob: probability of replacing a token with *mask_idx*.
+        leave_unmasked_prob: probability that a masked token is unmasked.
+        random_token_prob: probability of replacing a masked token with a
+            random token from the vocabulary.
+        freq_weighted_replacement: sample random replacement words based on
+            word frequencies in the vocab.
+        mask_whole_words: only mask whole words. This should be a byte mask
+            over vocab indices, indicating whether it is the beginning of a
+            word. We will extend any mask to encompass the whole word.
+        bpe: BPE to use for whole-word masking.
+        mask_multiple_length : repeat each mask index multiple times. Default
+            value is 1.
+        mask_stdev : standard deviation of masks distribution in case of
+            multiple masking. Default value is 0.
+    """
+
+    @classmethod
+    def apply_mask(cls, dataset: torch.utils.data.Dataset, *args, **kwargs):
+        """Return the source and target datasets for masked LM training."""
+        dataset = LRUCacheDataset(dataset)
+        return (
+            LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=False)),
+            LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=True)),
+        )
+
+    def __init__(
+        self,
+        dataset: torch.utils.data.Dataset,
+        vocab: Dictionary,
+        pad_idx: int,
+        mask_idx: int,
+        return_masked_tokens: bool = False,
+        seed: int = 1,
+        mask_prob: float = 0.15,
+        leave_unmasked_prob: float = 0.1,
+        random_token_prob: float = 0.1,
+        freq_weighted_replacement: bool = False,
+        mask_whole_words: torch.Tensor = None,
+        mask_multiple_length: int = 1,
+        mask_stdev: float = 0.0,
+        skip_masking: bool = False,
+    ):
+        assert 0.0 < mask_prob < 1.0
+        assert 0.0 <= random_token_prob <= 1.0
+        assert 0.0 <= leave_unmasked_prob <= 1.0
+        assert random_token_prob + leave_unmasked_prob <= 1.0
+        assert mask_multiple_length >= 1
+        assert mask_stdev >= 0.0
+
+        self.dataset = dataset
+        self.vocab = vocab
+        self.pad_idx = pad_idx
+        self.mask_idx = mask_idx
+        self.return_masked_tokens = return_masked_tokens
+        self.seed = seed
+        self.mask_prob = mask_prob
+        self.leave_unmasked_prob = leave_unmasked_prob
+        self.random_token_prob = random_token_prob
+        self.mask_whole_words = mask_whole_words
+        self.mask_multiple_length = mask_multiple_length
+        self.mask_stdev = mask_stdev
+        self.skip_masking = skip_masking
+
+        if random_token_prob > 0.0:
+            if freq_weighted_replacement:
+                weights = np.array(self.vocab.count)
+            else:
+                weights = np.ones(len(self.vocab))
+            weights[: self.vocab.nspecial] = 0
+            self.weights = weights / weights.sum()
+
+        self.epoch = 0
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return True  # only the noise changes, not item sizes
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        self.epoch = epoch
+
+    def __getitem__(self, index: int):
+        return self.__getitem_cached__(self.seed, self.epoch, index)
+
+    @lru_cache(maxsize=8)
+    def __getitem_cached__(self, seed: int, epoch: int, index: int):
+        seed = int(hash((seed, epoch, index)) % 1e6)
+        rng = np.random.default_rng(seed)
+        item = self.dataset[index]
+        sz = len(item)
+
+        assert (
+            self.mask_idx not in item
+        ), "Dataset contains mask_idx (={}), this is not expected!".format(
+            self.mask_idx,
+        )
+        if self.skip_masking:
+            return torch.from_numpy(np.copy(item))
+
+        if self.mask_whole_words is not None:
+            word_begins_mask = self.mask_whole_words.gather(0, item)
+            word_begins_idx = word_begins_mask.nonzero().view(-1)
+            sz = len(word_begins_idx)
+            words = np.split(word_begins_mask, word_begins_idx)[1:]
+            assert len(words) == sz
+            word_lens = list(map(len, words))
+
+        # decide elements to mask
+        mask = np.full(sz, False)
+        num_mask = int(
+            # add a random number for probabilistic rounding
+            self.mask_prob * sz / float(self.mask_multiple_length)
+            + rng.random()
+        )
+
+        # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453)
+        mask_idc = rng.choice(sz, num_mask, replace=False)
+        if self.mask_stdev > 0.0:
+            lengths = rng.normal(
+                self.mask_multiple_length, self.mask_stdev, size=num_mask
+            )
+            lengths = [max(0, int(round(x))) for x in lengths]
+            mask_idc = np.asarray(
+                [
+                    mask_idc[j] + offset
+                    for j in range(len(mask_idc))
+                    for offset in range(lengths[j])
+                ],
+                dtype=np.int64,
+            )
+        else:
+            mask_idc = np.concatenate(
+                [mask_idc + i for i in range(self.mask_multiple_length)]
+            )
+        mask_idc = mask_idc[mask_idc < len(mask)]
+        try:
+            mask[mask_idc] = True
+        except:  # something wrong
+            print("Assigning mask indexes {} to mask {} failed!".format(mask_idc, mask))
+            raise
+
+        # if self.return_masked_tokens:
+        #     print((
+        #         f"IDX={index}; seed={seed}; epoch={epoch}; is_tgt={self.return_masked_tokens}: "
+        #         f"{np.nonzero(mask)[0].sum()}"
+        #     ))
+        if self.return_masked_tokens:
+            # exit early if we're just returning the masked tokens
+            # (i.e., the targets for masked LM training)
+            if self.mask_whole_words is not None:
+                mask = np.repeat(mask, word_lens)
+            new_item = np.full(len(mask), self.pad_idx)
+            new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1]
+            return torch.from_numpy(new_item)
+
+        # decide unmasking and random replacement
+        rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob
+        if rand_or_unmask_prob > 0.0:
+            rand_or_unmask = mask & (rng.random(sz) < rand_or_unmask_prob)
+            if self.random_token_prob == 0.0:
+                unmask = rand_or_unmask
+                rand_mask = None
+            elif self.leave_unmasked_prob == 0.0:
+                unmask = None
+                rand_mask = rand_or_unmask
+            else:
+                unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob
+                decision = rng.random(sz) < unmask_prob
+                unmask = rand_or_unmask & decision
+                rand_mask = rand_or_unmask & (~decision)
+        else:
+            unmask = rand_mask = None
+
+        if unmask is not None:
+            mask = mask ^ unmask
+
+        if self.mask_whole_words is not None:
+            mask = np.repeat(mask, word_lens)
+
+        new_item = np.copy(item)
+        new_item[mask] = self.mask_idx
+        if rand_mask is not None:
+            num_rand = rand_mask.sum()
+            if num_rand > 0:
+                if self.mask_whole_words is not None:
+                    rand_mask = np.repeat(rand_mask, word_lens)
+                    num_rand = rand_mask.sum()
+
+                new_item[rand_mask] = rng.choice(
+                    len(self.vocab),
+                    num_rand,
+                    p=self.weights,
+                )
+
+        return torch.from_numpy(new_item)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
new file mode 100644
index 0000000000..54fd583b64
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
@@ -0,0 +1,253 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import FairseqDataset, data_utils
+
+
+def collate(samples, pad_idx, eos_idx, fixed_pad_length=None, pad_to_bsz=None):
+    if len(samples) == 0:  # an empty batch collates to an empty dict
+        return {}
+
+    def merge(key, is_list=False):  # collate field `key` of all samples (left_pad=False)
+        if is_list:  # the field is a list per sample: collate it position by position
+            res = []
+            for i in range(len(samples[0][key])):
+                res.append(
+                    data_utils.collate_tokens(
+                        [s[key][i] for s in samples],
+                        pad_idx,
+                        eos_idx,
+                        left_pad=False,
+                        pad_to_length=fixed_pad_length,
+                        pad_to_bsz=pad_to_bsz,
+                    )
+                )
+            return res
+        else:
+            return data_utils.collate_tokens(
+                [s[key] for s in samples],
+                pad_idx,
+                eos_idx,
+                left_pad=False,
+                pad_to_length=fixed_pad_length,
+                pad_to_bsz=pad_to_bsz,
+            )
+
+    src_tokens = merge("source")
+    if samples[0]["target"] is not None:  # the first sample decides for the whole batch
+        is_target_list = isinstance(samples[0]["target"], list)
+        target = merge("target", is_target_list)
+    else:
+        target = src_tokens  # no explicit target: reuse the collated source batch
+
+    return {
+        "id": torch.LongTensor([s["id"] for s in samples]),
+        "nsentences": len(samples),
+        "ntokens": sum(len(s["source"]) for s in samples),  # counts source tokens only
+        "net_input": {
+            "src_tokens": src_tokens,
+            "src_lengths": torch.LongTensor([s["source"].numel() for s in samples]),
+        },
+        "target": target,
+    }
+
+
+class MonolingualDataset(FairseqDataset):
+    """
+    A wrapper around torch.utils.data.Dataset for monolingual data.
+
+    Args:
+        dataset (torch.utils.data.Dataset): dataset to wrap
+        sizes (List[int]): sentence lengths
+        src_vocab (~fairseq.data.Dictionary): vocabulary
+        shuffle (bool, optional): shuffle the elements before batching
+            (default: False).
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        src_vocab,
+        tgt_vocab=None,
+        add_eos_for_other_targets=False,
+        shuffle=False,
+        targets=None,
+        add_bos_token=False,
+        fixed_pad_length=None,
+        pad_to_bsz=None,
+        src_lang_idx=None,
+        tgt_lang_idx=None,
+    ):
+        self.dataset = dataset
+        self.sizes = np.array(sizes)  # ndarray, so it can be indexed with index arrays
+        self.vocab = src_vocab
+        self.tgt_vocab = tgt_vocab or src_vocab  # target vocabulary defaults to the source one
+        self.add_eos_for_other_targets = add_eos_for_other_targets
+        self.shuffle = shuffle
+        self.add_bos_token = add_bos_token
+        self.fixed_pad_length = fixed_pad_length
+        self.pad_to_bsz = pad_to_bsz
+        self.src_lang_idx = src_lang_idx
+        self.tgt_lang_idx = tgt_lang_idx
+
+        assert targets is None or all(
+            t in {"self", "future", "past"} for t in targets
+        ), "targets must be none or one of 'self', 'future', 'past'"
+        if targets is not None and len(targets) == 0:
+            targets = None  # an empty target list is treated like no targets at all
+        self.targets = targets
+
+    def __getitem__(self, index):
+        if self.targets is not None:
+            # *future_target* is the original sentence
+            # *source* is shifted right by 1 (maybe left-padded with eos)
+            # *past_target* is shifted right by 2 (left-padded as needed)
+            #
+            # Left-to-right language models should condition on *source* and
+            # predict *future_target*.
+            # Right-to-left language models should condition on *source* and
+            # predict *past_target*.
+            source, future_target, past_target = self.dataset[index]
+            source, target = self._make_source_target(
+                source, future_target, past_target
+            )
+        else:
+            source = self.dataset[index]  # without targets the wrapped item is the source itself
+            target = None  # collate() then falls back to the collated source batch
+        source, target = self._maybe_add_bos(source, target)
+        return {"id": index, "source": source, "target": target}
+
+    def __len__(self):
+        return len(self.dataset)  # one example per item of the wrapped dataset
+
+    def _make_source_target(self, source, future_target, past_target):
+        if self.targets is not None:
+            target = []
+
+            if (
+                self.add_eos_for_other_targets
+                and (("self" in self.targets) or ("past" in self.targets))
+                and source[-1] != self.vocab.eos()
+            ):
+                # append eos at the end of source
+                source = torch.cat([source, source.new([self.vocab.eos()])])
+
+                if "future" in self.targets:  # keep future_target as long as the extended source
+                    future_target = torch.cat(
+                        [future_target, future_target.new([self.vocab.pad()])]
+                    )
+                if "past" in self.targets:
+                    # first token is before the start of sentence which is only used in "none" break mode when
+                    # add_eos_for_other_targets is False
+                    past_target = torch.cat(
+                        [
+                            past_target.new([self.vocab.pad()]),
+                            past_target[1:],
+                            source[-2, None],  # token just before the eos appended above
+                        ]
+                    )
+
+            for t in self.targets:  # targets are collected in the order they were requested
+                if t == "self":
+                    target.append(source)
+                elif t == "future":
+                    target.append(future_target)
+                elif t == "past":
+                    target.append(past_target)
+                else:
+                    raise Exception("invalid target " + t)
+
+            if len(target) == 1:  # a single target is returned bare, not as a one-element list
+                target = target[0]
+        else:
+            target = future_target  # not reached from __getitem__, which only calls this with targets set
+
+        return source, self._filter_vocab(target)
+
+    def _maybe_add_bos(self, source, target):  # prepend bos to source/target when add_bos_token is set
+        if self.add_bos_token:
+            source = torch.cat([source.new([self.vocab.bos()]), source])
+            if target is not None:  # NOTE(review): target may be a list (several targets); confirm this path handles it
+                target = torch.cat([target.new([self.tgt_vocab.bos()]), target])  # bos of the target vocabulary
+        return source, target
+
+    def num_tokens_vec(self, indices):
+        """Return the number of tokens for a set of positions defined by indices.
+        This value is used to enforce ``--max-tokens`` during batching."""
+        return self.sizes[indices]  # self.sizes is an ndarray, so `indices` may itself be an array
+
+    def _filter_vocab(self, target):
+        if len(self.tgt_vocab) != len(self.vocab):  # filtering only applies when the vocabulary sizes differ
+
+            def _filter(target):
+                mask = target.ge(len(self.tgt_vocab))  # ids that do not fit the target vocabulary
+                if mask.any():
+                    target[mask] = self.tgt_vocab.unk()  # overwritten in place with the target unk index
+                return target
+
+            if isinstance(target, list):  # several targets: filter each tensor
+                return [_filter(t) for t in target]
+            return _filter(target)
+        return target
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch with the following keys:
+
+                - `id` (LongTensor): example IDs in the original input order
+                - `ntokens` (int): total number of source tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in
+                    the source sentence of shape `(bsz, src_len)`. Padding will
+                    appear on the right.
+
+                - `target` (LongTensor): a padded 2D Tensor of tokens in the
+                  target sentence of shape `(bsz, tgt_len)`. Padding will appear
+                  on the right.
+        """
+        return collate(
+            samples,
+            self.vocab.pad(),  # pad and eos indices both come from the source vocabulary
+            self.vocab.eos(),
+            self.fixed_pad_length,
+            self.pad_to_bsz,
+        )
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return self.sizes[index]  # taken from the `sizes` passed to __init__, not recomputed
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return self.sizes[index]  # the same per-example value that num_tokens() returns
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]  # random tie-breaker among equal sizes
+        else:
+            order = [np.arange(len(self))]  # deterministic tie-breaker: the original index
+        order.append(self.sizes)  # np.lexsort uses the LAST key as the primary sort key
+        return np.lexsort(order)  # indices sorted by size, ties ordered by the first key
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)  # mirrors the wrapped dataset; False if it has no such attribute
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(indices)  # delegated unchanged to the wrapped dataset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
new file mode 100644
index 0000000000..6f2fe074b2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
@@ -0,0 +1,285 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import asyncio
+import logging
+import time
+from collections import OrderedDict
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from fairseq.data import data_utils
+
+from . import FairseqDataset
+
+logger = logging.getLogger(__name__)
+
+
+class MultiCorpusDataset(FairseqDataset):
+    """
+    Stores multiple instances of FairseqDataset together.
+    Unless batch_sample=True, requires each instance
+    to be the same dataset, as the collate method needs to work on batches with
+    samples from each dataset.
+
+    Allows specifying a distribution over the datasets to use. Note that unlike
+    MultiCorpusSampledDataset, this distribution allows sampling for each item,
+    rather than on a batch level. Note that datasets with sampling probability
+    of 0 will be skipped.
+
+    Each time ordered_indices() is called, a new sample is generated with
+    the specified distribution.
+
+    Args:
+        datasets: an OrderedDict of FairseqDataset instances.
+        distribution: a List containing the probability of getting an utterance from
+                        corresponding dataset
+        seed: random seed for sampling the datasets
+        sort_indices: if true, will sort the ordered indices by size
+        batch_sample: if true, will ensure each batch is from a single dataset
+    """
+
+    def __init__(
+        self,
+        datasets: Dict[str, FairseqDataset],
+        distribution: List[float],
+        seed: int,
+        sort_indices: bool = False,
+        batch_sample: bool = False,
+        distributed_rank: Optional[int] = None,
+    ):
+        super().__init__()
+        assert isinstance(datasets, OrderedDict)
+        assert len(datasets) == len(distribution)
+        assert abs(sum(distribution) - 1) < 1e-6  # tolerance: float weights rarely sum to exactly 1
+        self.datasets = datasets
+        self.distribution = distribution
+        self.seed = seed
+        self.sort_indices = sort_indices
+        self.batch_sample = batch_sample
+        self.distributed_rank = distributed_rank  # only used to seed the batch shuffle in batch_by_size()
+
+        # Avoid repeated conversions to list later
+        self.dataset_list = list(datasets.values())
+        self.total_num_instances = 0
+
+        first_dataset = self.dataset_list[0]
+
+        self.num_instances_per_dataset = []  # 0 for datasets whose sampling probability is 0
+        self.dataset_offsets = []  # global index at which each dataset starts
+        for i, dataset in enumerate(self.dataset_list):
+            assert isinstance(dataset, FairseqDataset)
+            assert type(dataset) is type(first_dataset)  # NOTE(review): enforced even when batch_sample=True
+            self.num_instances_per_dataset.append(
+                0 if self.distribution[i] == 0 else len(dataset)
+            )
+            self.dataset_offsets.append(self.total_num_instances)
+            self.total_num_instances += self.num_instances_per_dataset[i]
+
+    def ordered_indices(self):  # draw a fresh sample of global indices for the current epoch
+        start = time.time()
+        with data_utils.numpy_seed(self.seed, self.epoch):  # self.epoch is assigned in set_epoch()
+            logger.info(
+                f"sampling new dataset with seed {self.seed} epoch {self.epoch}"
+            )
+            sampled_indices = []
+            num_selected_instances = 0
+
+            # For each dataset i, sample self.distribution[i] * self.total_num_instances
+            for i, key in enumerate(self.datasets):
+                if self.distribution[i] == 0:
+                    # skip dataset if sampling probability is 0
+                    continue
+
+                if any(self.distribution[i + 1 :]):  # another dataset with non-zero probability follows
+                    num_instances = int(self.distribution[i] * self.total_num_instances)
+                    high = self.dataset_offsets[i + 1]
+                else:  # last sampled dataset takes the int() rounding remainder, even if skipped ones follow
+                    num_instances = self.total_num_instances - num_selected_instances
+                    high = self.total_num_instances
+
+                logger.info(f"sampling {num_instances} from {key} dataset")
+                num_selected_instances += num_instances
+
+                # First, add k copies of the dataset where k = num_instances // len(dataset).
+                # This ensures an equal distribution of the data points as much as possible.
+                # For the remaining entries randomly sample them
+                dataset_size = len(self.datasets[key])
+                num_copies = num_instances // dataset_size
+                dataset_indices = (
+                    np.random.permutation(high - self.dataset_offsets[i])
+                    + self.dataset_offsets[i]
+                )[: num_instances - num_copies * dataset_size]
+                if num_copies > 0:
+                    sampled_indices += list(
+                        np.concatenate(
+                            (
+                                np.repeat(
+                                    np.arange(self.dataset_offsets[i], high), num_copies
+                                ),
+                                dataset_indices,
+                            )
+                        )
+                    )
+                else:
+                    sampled_indices += list(dataset_indices)
+
+            assert (
+                len(sampled_indices) == self.total_num_instances
+            ), f"{len(sampled_indices)} vs {self.total_num_instances}"
+
+            np.random.shuffle(sampled_indices)
+            if self.sort_indices:
+                sampled_indices.sort(key=lambda i: self.num_tokens(i))
+
+            logger.info(
+                "multi_corpus_dataset ordered_indices took {}s".format(
+                    time.time() - start
+                )
+            )
+            return np.array(sampled_indices, dtype=np.int64)
+
+    def _map_index(self, index: int):
+        """
+        If dataset A has length N and dataset B has length M
+        then index 1 maps to index 1 of dataset A, and index N + 1
+        maps to index 1 of B.
+        """
+        counter = 0  # global index at which the current dataset starts
+        for num_instances, key in zip(self.num_instances_per_dataset, self.datasets):
+            if index < counter + num_instances:  # never true for a dataset holding 0 instances
+                return index - counter, key  # (index local to that dataset, dataset key)
+            counter += num_instances
+        raise ValueError(
+            "Invalid index: {}, max: {}".format(index, self.total_num_instances)
+        )
+
+    def __len__(self):
+        """
+        Sum of the lengths of all datasets whose sampling probability is non-zero
+        """
+        return self.total_num_instances
+
+    async def getitem(self, index):
+        new_index, key = self._map_index(index)
+        try:
+            if hasattr(self.datasets[key], "getitem"):  # prefer the dataset's own awaitable getter
+                item = await self.datasets[key].getitem(new_index)
+            else:
+                item = self.datasets[key][new_index]
+            item["full_id"] = index  # global index; collater() uses it to find the source dataset
+            return item
+        except Exception as e:
+            e.args = (f"Error from {key} dataset", *e.args)  # tag the error with the dataset key
+            raise
+
+    def __getitem__(self, index):
+        return asyncio.run(self.getitem(index))  # synchronous wrapper around the getitem() coroutine
+
+    async def getitems(self, indices):
+        # initialize a bunch of everstore read operations
+        # wait in the end to reduce overhead
+        # very helpful if io is latency bounded
+
+        max_concurrency = 32  # upper bound on getitem() calls in flight at once
+        sem = asyncio.Semaphore(max_concurrency)
+
+        async def controlled_getitem(index):
+            async with sem:  # wait for a free slot before fetching
+                return await self.getitem(index)
+
+        coroutines = []
+        for index in indices:
+            coroutines.append(controlled_getitem(index))
+        results = await asyncio.gather(*coroutines)  # results come back in the order of `indices`
+        return results
+
+    def __getitems__(self, indices):
+        return asyncio.run(self.getitems(indices))  # synchronous wrapper around the getitems() coroutine
+
+    def collater(self, samples):
+        """
+        If we are doing batch sampling, then pick the right collater to use.
+
+        Otherwise we assume all collaters are the same.
+        """
+        if len(samples) == 0:
+            return None
+        if "full_id" in samples[0]:
+            _, key = self._map_index(samples[0]["full_id"])  # the first sample decides which dataset collates
+            try:
+                batch = self.datasets[key].collater(samples)
+            except Exception:
+                print(f"Collating failed for key {key}", flush=True)  # report the dataset, then re-raise
+                raise
+            return batch
+        else:
+            # Subclasses may override __getitem__ to not specify full_id
+            return list(self.datasets.values())[0].collater(samples)
+
+    def num_tokens(self, index: int):
+        index, key = self._map_index(index)  # global index -> (local index, dataset key)
+        return self.datasets[key].num_tokens(index)
+
+    def size(self, index: int):
+        index, key = self._map_index(index)  # global index -> (local index, dataset key)
+        return self.datasets[key].size(index)
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return False  # ordered_indices() resamples using the current epoch
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        logger.info(f"setting epoch of multi_corpus_dataset to {epoch}")
+        self.epoch = epoch  # read by ordered_indices() and batch_by_size() when seeding
+
+    @property
+    def supports_prefetch(self):
+        return False  # this class defines no prefetch() of its own
+
+    @property
+    def supports_fetch_outside_dataloader(self):
+        return all(  # true only when every wrapped dataset supports it
+            self.datasets[key].supports_fetch_outside_dataloader
+            for key in self.datasets
+        )
+
+    def batch_by_size(
+        self,
+        indices,
+        max_tokens=None,
+        max_sentences=None,
+        required_batch_size_multiple=1,
+    ):
+        if not self.batch_sample:  # batches may mix datasets: use the base implementation as is
+            return super().batch_by_size(
+                indices, max_tokens, max_sentences, required_batch_size_multiple
+            )
+
+        dataset_indices = {key: [] for key in self.datasets}  # global indices grouped by source dataset
+        for i in indices:
+            _, key = self._map_index(i)
+            dataset_indices[key].append(i)
+
+        batches = []
+        for key in dataset_indices:  # batch each dataset separately so no batch mixes datasets
+            cur_batches = super().batch_by_size(
+                np.array(dataset_indices[key], dtype=np.int64),
+                max_tokens,
+                max_sentences,
+                required_batch_size_multiple,
+            )
+            logger.info(f"Created {len(cur_batches)} batches for dataset {key}")
+            batches += cur_batches
+
+        # If this dataset is used in a distributed training setup,
+        # then shuffle such that the order is seeded by the distributed rank
+        # as well
+        if self.distributed_rank is not None:  # without a rank the batches stay grouped by dataset
+            with data_utils.numpy_seed(self.seed, self.epoch, self.distributed_rank):
+                np.random.shuffle(batches)
+        return batches
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
new file mode 100644
index 0000000000..e2e9fdf004
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+from typing import Callable, Dict, List
+
+import numpy as np
+
+from . import FairseqDataset
+
+
+def uniform_sampler(x):
+    # Sample from uniform distribution
+    return np.random.choice(x, 1).item()  # draw one element of x and unwrap the 1-element array
+
+
+class MultiCorpusSampledDataset(FairseqDataset):
+    """
+    Stores multiple instances of FairseqDataset together and in every iteration
+    creates a batch by first sampling a dataset according to a specified
+    probability distribution and then getting instances from that dataset.
+
+    Args:
+        datasets: an OrderedDict of FairseqDataset instances.
+        sampling_func: A function for sampling over list of dataset keys.
+            The default strategy is to sample uniformly.
+    """
+
+    def __init__(
+        self,
+        datasets: Dict[str, FairseqDataset],
+        sampling_func: Callable[[List], int] = None,
+    ):
+        super().__init__()
+        assert isinstance(datasets, OrderedDict)
+        self.datasets = datasets
+        if sampling_func is None:
+            sampling_func = uniform_sampler  # default: every dataset key is equally likely
+        self.sampling_func = sampling_func  # called in collater() with the list of dataset keys
+
+        self.total_num_instances = 0
+        for _, dataset in datasets.items():
+            assert isinstance(dataset, FairseqDataset)
+            self.total_num_instances += len(dataset)
+
+        self._ordered_indices = None  # filled on the first call to ordered_indices()
+
+    def __len__(self):
+        """
+        Length of this dataset is the sum of individual datasets
+        """
+        return self.total_num_instances  # accumulated once in __init__
+
+    def ordered_indices(self):
+        """
+        Ordered indices for batching. Here we call the underlying
+        dataset's ordered_indices() so that we get the same random ordering
+        as we would have from using the underlying dataset directly.
+        """
+        if self._ordered_indices is None:  # computed on the first call only, then reused
+            self._ordered_indices = OrderedDict(
+                [
+                    (key, dataset.ordered_indices())
+                    for key, dataset in self.datasets.items()
+                ]
+            )
+        return np.arange(len(self))  # the wrapper itself is traversed in plain index order
+
+    def _map_index_to_dataset(self, key: str, index: int):
+        """
+        Different underlying datasets have different lengths. In order to ensure
+        we are not accessing an index outside the range of the current dataset
+        size, we wrap around. This function should be called after we have
+        created an ordering for this and all underlying datasets.
+        """
+        assert (
+            self._ordered_indices is not None
+        ), "Must call MultiCorpusSampledDataset.ordered_indices() first"
+        mapped_index = index % len(self.datasets[key])  # wrap around datasets shorter than this one
+        return self._ordered_indices[key][mapped_index]  # position in that dataset's own ordering
+
+    def __getitem__(self, index: int):
+        """
+        Get the item associated with index from each underlying dataset.
+        Since index is in the range of [0, TotalNumInstances), we need to
+        map the index to the dataset before retrieving the item.
+        """
+        return OrderedDict(  # one entry per dataset key; collater() later keeps a single key
+            [
+                (key, dataset[self._map_index_to_dataset(key, index)])
+                for key, dataset in self.datasets.items()
+            ]
+        )
+
+    def collater(self, samples: List[Dict]):
+        """
+        Generate a mini-batch for this dataset.
+        To convert this into a regular mini-batch we use the following
+        logic:
+            1. Select a dataset using the specified probability distribution.
+            2. Call the collater function of the selected dataset.
+        """
+        if len(samples) == 0:
+            return None
+
+        selected_key = self.sampling_func(list(self.datasets.keys()))  # one dataset is chosen per batch
+        selected_samples = [sample[selected_key] for sample in samples]  # items of the other datasets are dropped
+        return self.datasets[selected_key].collater(selected_samples)
+
+    def num_tokens(self, index: int):
+        """
+        Return an example's length (number of tokens), used for batching. Here
+        we return the max across all examples at index across all underlying
+        datasets.
+        """
+        return max(  # any dataset may be picked at collate time, so budget for the longest item
+            dataset.num_tokens(self._map_index_to_dataset(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    def size(self, index: int):
+        """
+        Return an example's size as a float or tuple. Here we return the max
+        across all underlying datasets. This value is used when filtering a
+        dataset with max-positions.
+        """
+        return max(  # compares whatever each dataset's size() returns for its mapped index
+            dataset.size(self._map_index_to_dataset(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    @property
+    def supports_prefetch(self):
+        return all(  # true only when every wrapped dataset reports prefetch support
+            getattr(dataset, "supports_prefetch", False)
+            for dataset in self.datasets.values()
+        )
+
+    def prefetch(self, indices):
+        for key, dataset in self.datasets.items():
+            dataset.prefetch(  # each dataset receives the indices translated into its own index space
+                [self._map_index_to_dataset(key, index) for index in indices]
+            )
+
+    @property
+    def supports_fetch_outside_dataloader(self):
+        return all(  # true only when every wrapped dataset supports it
+            self.datasets[key].supports_fetch_outside_dataloader
+            for key in self.datasets
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
new file mode 100644
index 0000000000..6264236915
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
new file mode 100644
index 0000000000..876dfcec36
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
@@ -0,0 +1,1156 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import json
+import logging
+import math
+import os
+from collections import OrderedDict, defaultdict
+from argparse import ArgumentError
+
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    ConcatDataset,
+    Dictionary,
+    LanguagePairDataset,
+    PrependTokenDataset,
+    SampledMultiDataset,
+    SampledMultiEpochDataset,
+    StripTokenDataset,
+    TransformEosLangPairDataset,
+    TruncateDataset,
+    data_utils,
+    indexed_dataset,
+)
+from fairseq.data.multilingual.multilingual_utils import (
+    EncoderLangtok,
+    LangTokSpec,
+    LangTokStyle,
+    augment_dictionary,
+    get_lang_tok,
+)
+from fairseq.data.multilingual.sampled_multi_dataset import CollateFormat
+from fairseq.file_io import PathManager
+from fairseq.utils import FileContentsAction, csv_str_list, eval_str_dict
+
+
+logger = logging.getLogger(__name__)
+
+SRC_DICT_NAME = "src"
+TGT_DICT_NAME = "tgt"
+
+
+def _lang_id(dic: Dictionary, lang: str):
+    """Return language ID index."""
+    idx = dic.index(lang)
+    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)  # unk_index is treated as "not in dic"
+    return idx
+
+
+def load_sampling_weights(from_file):  # read sampling weights from a JSON file
+    with open(from_file, encoding="utf-8") as f:  # decode as UTF-8 instead of the platform locale
+        weights = json.load(f)  # returned exactly as parsed; no validation is done here
+    return weights
+
+
+class MultilingualDatasetManager(object):
+    def __init__(self, args, lang_pairs, langs, dicts, sampling_method):
+        super().__init__()
+        self.args = args
+        self.seed = args.seed
+        self.lang_pairs = lang_pairs
+        self.extra_lang_pairs = (
+            list({p for _, v in args.extra_lang_pairs.items() for p in v.split(",")})  # de-duplicated via a set
+            if args.extra_lang_pairs
+            else []
+        )
+        self.src_langs = {  # NOTE(review): built from args.lang_pairs, not the lang_pairs parameter -- confirm intended
+            p.split("-")[0] for p in args.lang_pairs + self.extra_lang_pairs
+        }
+        self.tgt_langs = {  # part after the first "-" of every "src-tgt" pair
+            p.split("-")[1] for p in args.lang_pairs + self.extra_lang_pairs
+        }
+        self.langs = langs
+        self.dicts = dicts
+        self.lang_dict = self.create_lang_dictionary(self.langs)
+        self.sampling_method = sampling_method
+        self.sampling_scheduler = None
+        self._has_sharded_data = False
+        self._num_shards_dict = {}
+        self._training_data_sizes = defaultdict(lambda: {})  # missing keys start out as an empty dict
+
+    @classmethod
+    def setup_data_manager(cls, args, lang_pairs, langs, dicts, sampling_method):
+        return MultilingualDatasetManager(  # NOTE(review): ignores cls, so a subclass still gets the base class -- confirm intended
+            args, lang_pairs, langs, dicts, sampling_method
+        )
+
+    @staticmethod
+    def add_args(parser):
+        parser.add_argument(
+            "data",
+            help="colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner",
+            action=FileContentsAction,
+        )
+        parser.add_argument(
+            "--langs",
+            default=None,
+            type=csv_str_list,
+            help="a list of languages comma sperated languages which can appear in lang-pairs; "
+            "note that the ordering determines language token IDs",
+        )
+        parser.add_argument(
+            "--lang-dict",
+            default=None,
+            type=str,
+            help="an external file which contains a list of "
+            "languages which can appear in lang-pairs; "
+            "note that the ordering determines language token IDs; "
+            "--langs and --lang-dict are two exclusive options",
+        )
+        parser.add_argument(
+            "--source-dict",
+            default=None,
+            type=str,
+            help="path to source dictionary; if specified it will override per language dictionary loading",
+        )
+        parser.add_argument(
+            "--target-dict",
+            default=None,
+            type=str,
+            help="path to target dictionary; if specified it will override per language dictionary loading",
+        )
+        parser.add_argument(
+            "--lang-tok-style",
+            default=LangTokStyle.multilingual.value,
+            type=str,
+            choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
+            help="language token styles",
+        )
+
+        parser.add_argument(
+            "--load-alignments",
+            action="store_true",
+            help="load the binarized alignments",
+        )
+        parser.add_argument(
+            "--left-pad-source",
+            default="True",
+            type=str,
+            metavar="BOOL",
+            help="pad the source on the left",
+        )
+        parser.add_argument(
+            "--left-pad-target",
+            default="False",
+            type=str,
+            metavar="BOOL",
+            help="pad the target on the left",
+        )
+        try:
+            parser.add_argument(
+                "--max-source-positions",
+                default=1024,
+                type=int,
+                metavar="N",
+                help="max number of tokens in the source sequence",
+            )
+            parser.add_argument(
+                "--max-target-positions",
+                default=1024,
+                type=int,
+                metavar="N",
+                help="max number of tokens in the target sequence",
+            )
+        except ArgumentError:
+            # this might have already been defined. Once we transition this to hydra it should be fine to add it here.
+            pass
+        parser.add_argument(
+            "--upsample-primary",
+            default=1,
+            type=int,
+            help="amount to upsample primary dataset",
+        )
+        parser.add_argument(
+            "--truncate-source",
+            action="store_true",
+            default=False,
+            help="truncate source to max-source-positions",
+        )
+        parser.add_argument(
+            "--encoder-langtok",
+            default=None,
+            type=str,
+            choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
+            metavar="SRCTGT",
+            help="prepend to the beginning of source sentence the source or target "
+            "language token. (src/tgt)",
+        )
+        parser.add_argument(
+            "--decoder-langtok",
+            action="store_true",
+            help="prepend to the beginning of target sentence the target language token",
+        )
+        parser.add_argument(
+            "--lang-tok-replacing-bos-eos", action="store_true", default=False
+        )
+        parser.add_argument(
+            "--enable-lang-ids",
+            default=False,
+            action="store_true",
+            help="whether to include language IDs in samples",
+        )
+        parser.add_argument(
+            "--enable-reservsed-directions-shared-datasets",
+            default=False,
+            action="store_true",
+            help="whether to allow datasets be used in reversed directions",
+        )
+
+        parser.add_argument(
+            "--extra-data",
+            help='a dictionary of data name to this path, \
+                            e.g. {"mined", path_to_mined_data, "denoised": path_to_denoised_data}',
+            type=lambda uf: eval_str_dict(uf, type=str),
+            default=None,
+        )
+        parser.add_argument(
+            "--extra-lang-pairs",
+            help='a dictionary of data name to the language pairs they serve, \
+                            e.g. {"mined": comma-separated-lang-pairs, "denoised":  comma-separated-lang-pairs}',
+            type=lambda uf: eval_str_dict(uf, type=str),
+            default=None,
+        )
+        parser.add_argument(
+            "--fixed-dictionary",
+            help="Fixed dictionary to use with model path",
+            default=None,
+            type=str,
+        )
+        parser.add_argument(
+            "--langtoks-specs",
+            help='a list of comma separated data types that a set of language tokens to be specialized for, \
+                            e.g. "main,dae,mined". There will be a set of language tokens added to the vocab to \
+                            distinguish languages in different training data types. If not specified, default language \
+                            tokens per languages will be added',
+            default=LangTokSpec.main.value,
+            type=csv_str_list,
+        )
+        parser.add_argument(
+            "--langtoks",
+            help='a dictionary of how to add language tokens, \
+                            e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
+                            ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
+            default=None,
+            type=lambda uf: eval_str_dict(uf, type=str),
+        )
+        parser.add_argument(
+            "--sampling-weights-from-file",
+            help='a file contain a python dictionary of how to sample data sets, \
+                                e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
+                                    "mono_dae:es_XX-es_XX: 0.3, "main:en_xx-fr_XX": 0.8 }',
+            default=None,
+            type=str,
+        )
+        parser.add_argument(
+            "--sampling-weights",
+            help='a dictionary of how to sample data sets, \
+                            e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
+                                   "mono_dae:es_XX-es_XX: 0.3, "main:en_xx-fr_XX": 0.8 }',
+            default=None,
+            type=lambda uf: eval_str_dict(uf, type=str),
+        )
+        parser.add_argument(
+            "--virtual-epoch-size",
+            default=None,
+            type=int,
+            help="virtual epoch size to speed up data loading",
+        )
+        parser.add_argument(
+            "--virtual-data-size",
+            default=None,
+            type=int,
+            help="virtual data size of the whole joint dataset to speed"
+            "up data loading and have specific dynamic sampling strategy interval",
+        )
+
+    @classmethod
+    def load_langs(cls, args, **kwargs):
+        """Return the ordered language list from --langs, --lang-dict or ``args.lang_pairs``."""
+        lang_dict_path, explicit_langs = args.lang_dict, args.langs
+        if lang_dict_path and explicit_langs:
+            raise ValueError("--langs and --lang-dict can not both be specified")
+        if lang_dict_path is None and explicit_langs is None:
+            logger.warning(
+                "External language dictionary is not provided; "
+                "use lang-pairs to infer the set of supported languages. "
+                "The language ordering is not stable which might cause "
+                "misalignment in pretraining and finetuning."
+            )
+            codes = set()  # every language mentioned on either side of a pair
+            for lang_pair in args.lang_pairs:
+                codes.update(lang_pair.split("-"))
+            langs = sorted(codes)
+            logger.info(f"inferred language list: {langs}")
+        elif lang_dict_path:
+            local_path = PathManager.get_local_path(lang_dict_path)
+            with open(local_path, "r", encoding="utf-8") as f:
+                langs = [line.strip() for line in f if line.strip()]  # skip blank lines
+            logger.info(
+                f"loaded language list from {args.lang_dict} as they are ordered in file"
+            )
+        elif explicit_langs:
+            langs = explicit_langs
+            logger.info(
+                f"parsed the language list as they are ordered in the option: {langs}"
+            )
+        return langs
+
+    def has_sharded_data(self, split):
+        """Truthy only when sharded data paths were seen and ``split`` is the training subset."""
+        train_split = getattr(self.args, "train_subset", None)
+        return self._has_sharded_data and split == train_split
+
+    def _shared_collater(self):
+        """Return True unless "mono_dae" extra data is set or language tokens replace BOS/EOS."""
+        has_dae = bool(self.args.extra_data) and "mono_dae" in self.args.extra_data
+        return not has_dae and not self.args.lang_tok_replacing_bos_eos
+
+    def estimate_global_pass_epoch(self, epoch):
+        """Return the 1-based shard epoch of ``epoch`` (None if virtual sizes are unset)."""
+        epoch_size = self.args.virtual_epoch_size
+        data_size = self.args.virtual_data_size
+        if epoch_size is None or data_size is None:
+            return None
+        # ceil: a trailing partial chunk of the data still costs a whole virtual epoch
+        epochs_per_pass = math.ceil(data_size / epoch_size)
+        # shift to 0-based for the floor division, then back to 1-based
+        return (epoch - 1) // epochs_per_pass + 1
+
+    @classmethod
+    def prepare(cls, load_dictionary, args, **kargs):
+        """Normalise ``args`` in place, then load the language list and the dictionaries.
+
+        Returns ``(language_list, dicts, training)``; ``training`` is False once
+        ``args.source_lang`` or ``args.target_lang`` is set. Raises ``ValueError`` if
+        --lang-pairs is missing or a checked pair uses a language outside the list.
+        """
+        args.left_pad_source = utils.eval_bool(args.left_pad_source)
+        args.left_pad_target = utils.eval_bool(args.left_pad_target)
+
+        if not hasattr(args, "shuffle_instance"):
+            args.shuffle_instance = False
+        if args.langtoks is None:
+            args.langtoks = {}
+        if "main" not in args.langtoks:
+            args.langtoks["main"] = (
+                args.encoder_langtok if args.encoder_langtok else None,
+                "tgt" if args.decoder_langtok else None,
+            )
+
+        def check_langs(langs, pairs):
+            messages = [
+                f"language pair {src}-{tgt} contains languages "
+                "that are not in the language dictionary"
+                for src, tgt in pairs
+                if src not in langs or tgt not in langs
+            ]
+            if messages:
+                raise ValueError(" ".join(messages) + f"; langs: {langs}")
+
+        if args.lang_pairs is None:
+            raise ValueError(
+                "--lang-pairs is required. List all the language pairs in the training objective."
+            )
+        if isinstance(args.lang_pairs, str):
+            args.lang_pairs = args.lang_pairs.split(",")
+        training = args.source_lang is None and args.target_lang is None
+        language_list = cls.load_langs(args, **kargs)
+        if training:
+            pairs_to_check = [p.split("-") for p in args.lang_pairs]
+        else:
+            pairs_to_check = [(args.source_lang, args.target_lang)]
+        check_langs(language_list, pairs_to_check)
+
+        def load_dictionary_and_postproc(path):
+            d = load_dictionary(path)
+            augment_dictionary(
+                dictionary=d,
+                language_list=language_list,
+                lang_tok_style=args.lang_tok_style,
+                langtoks_specs=args.langtoks_specs,
+                extra_data=args.extra_data,
+            )
+            return d
+
+        dicts = cls.load_all_dictionaries(
+            args, language_list, load_dictionary_and_postproc, training
+        )
+        return language_list, dicts, training
+
+    @classmethod
+    def load_all_dictionaries(cls, args, language_list, load_dictionary, training):
+        """Load the dictionaries required by the configured language pairs.
+
+        Per-language ``dict.<lang>.txt`` files are read from the first path in
+        ``args.data`` unless --source-dict/--target-dict overrides that side;
+        --fixed-dictionary maps every language to one shared dictionary instead.
+        Returns a mapping from language (or SRC_DICT_NAME/TGT_DICT_NAME) to dictionary.
+        """
+        dicts = OrderedDict()
+        if args.source_dict is not None:
+            dicts[SRC_DICT_NAME] = load_dictionary(args.source_dict)
+        if args.target_dict is not None:
+            dicts[TGT_DICT_NAME] = load_dictionary(args.target_dict)
+
+        if training:
+            all_pairs = list(args.lang_pairs)
+            if args.extra_lang_pairs:
+                for v in args.extra_lang_pairs.values():
+                    all_pairs.extend(v.split(","))
+            src_langs_to_load_dicts = sorted({p.split("-")[0] for p in all_pairs})
+            tgt_langs_to_load_dicts = sorted({p.split("-")[1] for p in all_pairs})
+        else:
+            src_langs_to_load_dicts = [args.source_lang]
+            tgt_langs_to_load_dicts = [args.target_lang]
+
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+
+        def load_dicts(langs_to_load_dicts):
+            for lang in langs_to_load_dicts:
+                dict_path = os.path.join(paths[0], "dict.{}.txt".format(lang))
+                dicts[lang] = load_dictionary(dict_path)
+                # Validate and log every language inside the loop: previously only
+                # the last one was checked, and an empty list hit an unbound ``lang``.
+                dict0 = next(iter(dicts.values()))
+                assert dicts[lang].pad() == dict0.pad()
+                assert dicts[lang].eos() == dict0.eos()
+                assert dicts[lang].unk() == dict0.unk()
+                logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
+
+        if args.fixed_dictionary is not None:
+            fixed_dict = load_dictionary(args.fixed_dictionary)
+            dicts = {
+                lang: fixed_dict
+                for lang in src_langs_to_load_dicts + tgt_langs_to_load_dicts
+            }
+        else:
+            if args.source_dict is None:
+                load_dicts(src_langs_to_load_dicts)
+            if args.target_dict is None:
+                load_dicts(tgt_langs_to_load_dicts)
+        return dicts
+
+    def get_source_dictionary(self, lang):
+        """Return the shared --source-dict dictionary if configured, else ``lang``'s own."""
+        # the override is stored under SRC_DICT_NAME rather than under a language code
+        key = lang if self.args.source_dict is None else SRC_DICT_NAME
+        return self.dicts[key]
+
+    def get_target_dictionary(self, lang):
+        """Return the shared --target-dict dictionary if configured, else ``lang``'s own."""
+        # the override is stored under TGT_DICT_NAME rather than under a language code
+        key = lang if self.args.target_dict is None else TGT_DICT_NAME
+        return self.dicts[key]
+
+    @classmethod
+    def create_lang_dictionary(cls, langs):
+        """Build a Dictionary whose special symbols are all "<unk>" and add ``langs`` in order."""
+        # hack: pad/eos/bos alias unk because the language dictionary needs no other specials
+        lang_dict = Dictionary(**dict.fromkeys(("pad", "eos", "unk", "bos"), "<unk>"))
+        for code in langs:
+            lang_dict.add_symbol(code)
+        return lang_dict
+
+    @classmethod
+    def get_langtok_index(cls, lang_tok, dic):
+        """Return ``dic.index(lang_tok)``; asserts that it is not the unknown-symbol index."""
+        position = dic.index(lang_tok)
+        not_found = "cannot find language token {} in the dictionary".format(lang_tok)
+        assert position != dic.unk_index, not_found
+        return position
+
+    def get_encoder_langtok(self, src_lang, tgt_lang, spec=None):
+        """Return the dictionary index of the encoder-side language token, or None.
+
+        A ``spec`` starting with "src" selects ``src_lang``; any other non-None
+        ``spec`` selects ``tgt_lang``. None is returned when ``spec`` or the
+        selected language is None.
+        """
+        if spec is None:
+            return None
+        lang = src_lang if (spec and spec.startswith("src")) else tgt_lang
+        if lang is None:
+            return None
+        langtok = get_lang_tok(
+            lang=lang, lang_tok_style=self.args.lang_tok_style, spec=spec
+        )
+        # the token is looked up in the source-side dictionary whenever src_lang is set
+        if src_lang:
+            dictionary = self.get_source_dictionary(src_lang)
+        else:
+            dictionary = self.get_target_dictionary(tgt_lang)
+        return self.get_langtok_index(langtok, dictionary)
+
+    def get_decoder_langtok(self, tgt_lang, spec=None):
+        """Index of the decoder language token for ``tgt_lang``; None when ``spec`` is None."""
+        if spec is None:
+            return None
+        style = self.args.lang_tok_style
+        langtok = get_lang_tok(lang=tgt_lang, lang_tok_style=style, spec=spec)
+        return self.get_langtok_index(langtok, self.get_target_dictionary(tgt_lang))
+
+    @classmethod
+    def load_data(cls, path, vdict, impl):
+        """Load the indexed dataset at ``path`` via ``data_utils.load_indexed_dataset``."""
+        return data_utils.load_indexed_dataset(path, vdict, impl)
+
+    @classmethod
+    def split_exists(cls, split, src, tgt, lang, data_path, dataset_impl):
+        stem = "{}.{}-{}.{}".format(split, src, tgt, lang)  # "<split>.<src>-<tgt>.<lang>"
+        return indexed_dataset.dataset_exists(os.path.join(data_path, stem), impl=dataset_impl)
+
+    def load_lang_dataset(
+        self,
+        data_path,
+        split,
+        src,
+        src_dict,
+        tgt,
+        tgt_dict,
+        combine,
+        dataset_impl,
+        upsample_primary,
+        max_source_positions,
+        prepend_bos=False,
+        load_alignments=False,
+        truncate_source=False,
+    ):
+        """Load source/target (and optionally alignment) datasets for one direction.
+
+        Parts ``split``, ``split1``, ``split2``, ... are read until one is missing
+        (only the first when ``combine`` is false); several parts are concatenated
+        with sample ratios ``[upsample_primary, 1, 1, ...]``. Files may be named
+        after either ``src-tgt`` or ``tgt-src``. Returns ``(src_dataset, tgt_dataset,
+        align_dataset)`` where ``align_dataset`` may be None. Raises
+        ``FileNotFoundError`` when not even the first part exists.
+        """
+        src_datasets, tgt_datasets = [], []
+
+        for k in itertools.count():
+            split_k = split + (str(k) if k > 0 else "")
+            # the files may be named after either direction; src-tgt is tried first
+            prefix = None
+            for lhs, rhs in ((src, tgt), (tgt, src)):
+                if self.split_exists(split_k, lhs, rhs, src, data_path, dataset_impl):
+                    prefix = os.path.join(
+                        data_path, "{}.{}-{}.".format(split_k, lhs, rhs)
+                    )
+                    break
+            if prefix is None:
+                if k > 0:
+                    break
+                logger.error(f"Dataset not found: {data_path}, {split_k}, {src}, {tgt}")
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, data_path)
+                )
+
+            src_part = self.load_data(prefix + src, src_dict, dataset_impl)
+            if truncate_source:
+                # strip EOS, cut to max_source_positions - 1 tokens, then re-append EOS
+                stripped = StripTokenDataset(src_part, src_dict.eos())
+                truncated = TruncateDataset(stripped, max_source_positions - 1)
+                src_part = AppendTokenDataset(truncated, src_dict.eos())
+            src_datasets.append(src_part)
+            tgt_datasets.append(self.load_data(prefix + tgt, tgt_dict, dataset_impl))
+
+            logger.info(
+                "{} {} {}-{} {} examples".format(
+                    data_path, split_k, src, tgt, len(src_datasets[-1])
+                )
+            )
+
+            if not combine:
+                break
+
+        assert len(src_datasets) == len(tgt_datasets)
+
+        if len(src_datasets) == 1:
+            src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0]
+        else:
+            sample_ratios = [upsample_primary] + [1] * (len(src_datasets) - 1)
+            src_dataset = ConcatDataset(src_datasets, sample_ratios)
+            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
+
+        if prepend_bos:
+            assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
+            src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
+            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
+
+        align_dataset = None
+        if load_alignments:
+            align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt))
+            if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
+                align_dataset = data_utils.load_indexed_dataset(
+                    align_path, None, dataset_impl
+                )
+
+        return src_dataset, tgt_dataset, align_dataset
+
+    def load_langpair_dataset(
+        self,
+        data_path,
+        split,
+        src,
+        src_dict,
+        tgt,
+        tgt_dict,
+        combine,
+        dataset_impl,
+        upsample_primary,
+        left_pad_source,
+        left_pad_target,
+        max_source_positions,
+        max_target_positions,
+        prepend_bos=False,
+        load_alignments=False,
+        truncate_source=False,
+        src_dataset_transform_func=lambda dataset: dataset,
+        tgt_dataset_transform_func=lambda dataset: dataset,
+        src_lang_id=None,
+        tgt_lang_id=None,
+        langpairs_sharing_datasets=None,
+    ):
+        """Create the LanguagePairDataset for ``src``-``tgt``, with optional dataset reuse.
+
+        ``langpairs_sharing_datasets`` (a dict, or None to disable sharing) caches
+        the transformed source/target datasets and the alignment dataset, keyed by
+        data path, split and the sorted language pair, so that the reversed
+        direction of the training split can reuse them instead of reloading.
+
+        ``max_target_positions`` is accepted but not used by this method.
+        """
+        norm_direction = "-".join(sorted([src, tgt]))
+        cache = langpairs_sharing_datasets
+        src_key = (data_path, split, norm_direction, src)
+        tgt_key = (data_path, split, norm_direction, tgt)
+        align_key = (data_path, split, norm_direction, src, tgt)
+        if cache is not None:
+            src_dataset = cache.get(src_key, "NotInCache")
+            tgt_dataset = cache.get(tgt_key, "NotInCache")
+            align_dataset = cache.get(align_key, "NotInCache")
+
+        # a hack: if any one of them is not in the cache, all three are reloaded
+        must_load = (
+            cache is None
+            or src_dataset == "NotInCache"
+            or tgt_dataset == "NotInCache"
+            or align_dataset == "NotInCache"
+            or split != getattr(self.args, "train_subset", None)
+        )
+        if not must_load:
+            logger.info(
+                f"Reusing source and target datasets of [{split}] {tgt}-{src} for reversed direction: "
+                f"[{split}] {src}-{tgt}: src length={len(src_dataset)}; tgt length={len(tgt_dataset)}"
+            )
+        else:
+            # source and target datasets can be reused in reversed directions to save memory;
+            # reversed directions of valid and test data will not share source and target datasets
+            src_dataset, tgt_dataset, align_dataset = self.load_lang_dataset(
+                data_path,
+                split,
+                src,
+                src_dict,
+                tgt,
+                tgt_dict,
+                combine,
+                dataset_impl,
+                upsample_primary,
+                max_source_positions=max_source_positions,
+                prepend_bos=prepend_bos,
+                load_alignments=load_alignments,
+                truncate_source=truncate_source,
+            )
+            src_dataset = src_dataset_transform_func(src_dataset)
+            tgt_dataset = tgt_dataset_transform_func(tgt_dataset)
+            if cache is not None:
+                cache[src_key] = src_dataset
+                cache[tgt_key] = tgt_dataset
+                cache[align_key] = align_dataset
+                if align_dataset is None:
+                    # no align data so flag the reverse direction as well in sharing
+                    cache[(data_path, split, norm_direction, tgt, src)] = align_dataset
+
+        return LanguagePairDataset(
+            src_dataset,
+            src_dataset.sizes,
+            src_dict,
+            tgt_dataset,
+            tgt_dataset.sizes if tgt_dataset is not None else None,
+            tgt_dict,
+            left_pad_source=left_pad_source,
+            left_pad_target=left_pad_target,
+            align_dataset=align_dataset,
+            src_lang_id=src_lang_id,
+            tgt_lang_id=tgt_lang_id,
+        )
+
+    def src_dataset_tranform_func(self, src_lang, tgt_lang, dataset, spec=None):
+        """Prepend the encoder language token to ``dataset`` when ``spec`` asks for one.
+
+        ``dataset`` is returned unchanged when language tokens replace BOS/EOS
+        (``alter_dataset_langtok`` handles that case) or when no token applies.
+        """
+        # TODO: Unify with alter_dataset_langtok
+        if self.args.lang_tok_replacing_bos_eos or spec is None:
+            return dataset
+        tok = self.get_encoder_langtok(src_lang, tgt_lang, spec)
+        return PrependTokenDataset(dataset, tok) if tok else dataset
+
+    def tgt_dataset_tranform_func(self, source_lang, target_lang, dataset, spec=None):
+        """Prepend the decoder language token to ``dataset`` when ``spec`` asks for one.
+
+        Returns None for a missing ``dataset`` (possible at inference time) and
+        the unchanged ``dataset`` when language tokens replace BOS/EOS (that case
+        is left to ``alter_dataset_langtok``) or when ``spec`` is empty.
+        """
+        if dataset is None:
+            return None
+        # TODO: Unify with alter_dataset_langtok; the complication there makes a
+        # unified framework difficult.
+        if self.args.lang_tok_replacing_bos_eos or not spec:
+            return dataset
+        tok = self.get_decoder_langtok(target_lang, spec)
+        if not tok:
+            return dataset
+        return PrependTokenDataset(dataset, tok)
+
+    def alter_dataset_langtok(
+        self,
+        lang_pair_dataset,
+        src_eos=None,
+        src_lang=None,
+        tgt_eos=None,
+        tgt_lang=None,
+        src_langtok_spec=None,
+        tgt_langtok_spec=None,
+    ):
+        """Wrap ``lang_pair_dataset`` in ``TransformEosLangPairDataset`` with language tokens.
+
+        Returned untouched when both specs are None. If a side's spec, EOS index or
+        language is missing, None is passed for both its old and its new symbol.
+        Note that the target EOS index is what gets passed as ``tgt_bos``.
+        """
+        if src_langtok_spec is None and tgt_langtok_spec is None:
+            return lang_pair_dataset
+
+        has_lang = src_lang is not None or tgt_lang is not None
+        if src_langtok_spec is None or src_eos is None or not has_lang:
+            src_eos = new_src_eos = None
+        else:
+            new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang, src_langtok_spec)
+        if not tgt_langtok_spec or tgt_eos is None or tgt_lang is None:
+            tgt_eos = new_tgt_bos = None
+        else:
+            new_tgt_bos = self.get_decoder_langtok(tgt_lang, tgt_langtok_spec)
+
+        return TransformEosLangPairDataset(
+            lang_pair_dataset,
+            src_eos=src_eos,
+            new_src_eos=new_src_eos,
+            tgt_bos=tgt_eos,
+            new_tgt_bos=new_tgt_bos,
+        )
+
+    def load_a_dataset(
+        self,
+        split,
+        data_path,
+        src,
+        src_dict,
+        tgt,
+        tgt_dict,
+        combine,
+        prepend_bos=False,
+        langpairs_sharing_datasets=None,
+        data_category=None,
+        **extra_kwargs,
+    ):
+        """Load the dataset of one ``src``-``tgt`` direction using the settings in ``self.args``.
+
+        ``extra_kwargs`` must provide "langtok_spec", a ``(source spec, target spec)``
+        pair; any other extra keyword is ignored. When language tokens replace
+        BOS/EOS, the loaded dataset is further wrapped by ``alter_dataset_langtok``.
+        """
+        args = self.args
+        loader_settings = (
+            args.dataset_impl, args.upsample_primary,
+            args.left_pad_source, args.left_pad_target,
+            args.max_source_positions, args.max_target_positions,
+        )
+        load_alignments, truncate_source = args.load_alignments, args.truncate_source
+        enable_lang_ids, lang_dictionary = args.enable_lang_ids, self.lang_dict
+        src_langtok_spec, tgt_langtok_spec = extra_kwargs["langtok_spec"]
+        # the resolved token indices are only used for the log line below
+        src_langtok = self.get_encoder_langtok(src, tgt, src_langtok_spec)
+        tgt_langtok = self.get_decoder_langtok(tgt, tgt_langtok_spec)
+        logger.info(
+            f"{data_category}:{src}-{tgt} src_langtok: {src_langtok}; tgt_langtok: {tgt_langtok}"
+        )
+
+        def add_src_langtok(dataset):
+            return self.src_dataset_tranform_func(src, tgt, dataset, src_langtok_spec)
+
+        def add_tgt_langtok(dataset):
+            return self.tgt_dataset_tranform_func(src, tgt, dataset, tgt_langtok_spec)
+
+        if enable_lang_ids and lang_dictionary is not None:
+            src_lang_id = _lang_id(lang_dictionary, src)
+            tgt_lang_id = _lang_id(lang_dictionary, tgt)
+        else:
+            src_lang_id = tgt_lang_id = None
+
+        langpair_ds = self.load_langpair_dataset(
+            data_path,
+            split,
+            src,
+            src_dict,
+            tgt,
+            tgt_dict,
+            combine,
+            *loader_settings,
+            prepend_bos,
+            load_alignments,
+            truncate_source,
+            src_dataset_transform_func=add_src_langtok,
+            tgt_dataset_transform_func=add_tgt_langtok,
+            src_lang_id=src_lang_id,
+            tgt_lang_id=tgt_lang_id,
+            langpairs_sharing_datasets=langpairs_sharing_datasets,
+        )
+        if not args.lang_tok_replacing_bos_eos:
+            return langpair_ds
+        # TODO: handle modified lang toks for mined data and dae data
+        src_side_dict = (
+            self.get_source_dictionary(src) if src else self.get_target_dictionary(tgt)
+        )
+        return self.alter_dataset_langtok(
+            langpair_ds,
+            src_eos=src_side_dict.eos(),
+            src_lang=src,
+            tgt_eos=self.get_target_dictionary(tgt).eos(),
+            tgt_lang=tgt,
+            src_langtok_spec=src_langtok_spec,
+            tgt_langtok_spec=tgt_langtok_spec,
+        )
+
+    def load_split_langpair_datasets(self, split, data_param_list):
+        """Load one dataset per entry of ``data_param_list`` for ``split``.
+
+        When --enable-reservsed-directions-shared-datasets is set, a single dict
+        is handed to every ``load_a_dataset`` call so datasets can be shared
+        between directions; otherwise None is passed.
+        """
+        shared = {} if self.args.enable_reservsed_directions_shared_datasets else None
+        # every parameter dict supplies the remaining load_a_dataset keyword arguments
+        return [
+            self.load_a_dataset(split=split, langpairs_sharing_datasets=shared, **param)
+            for param in data_param_list
+        ]
+
+    def get_data_paths_and_lang_pairs(self, split):
+        """Return ``(datapaths, lang_pairs)`` dicts keyed by data category ("main" always set).
+
+        Extra data paths and language pairs are merged in for the training subset only.
+        """
+        datapaths = {"main": self.args.data}
+        lang_pairs = {"main": self.lang_pairs}
+        if split != getattr(self.args, "train_subset", None):
+            return datapaths, lang_pairs
+        if self.args.extra_data:
+            datapaths.update(self.args.extra_data)
+        for category, pairs in (self.args.extra_lang_pairs or {}).items():
+            lang_pairs[category] = pairs.split(",")
+        return datapaths, lang_pairs
+
+    @classmethod
+    def get_dataset_key(cls, data_category, src, tgt):
+        return f"{data_category}:{src}-{tgt}"  # key format, e.g. "main:en_XX-es_XX"
+
+    @classmethod
+    def _get_shard_num_dict(cls, split, paths):
+        """Count, per "{src}-{tgt}" direction, the paths containing ``split`` .idx files."""
+        shards = defaultdict(int)
+        for path in paths:
+            # idx files of the form "{split}.{src}-{tgt}.{lang}.idx"; one count per path
+            directions = {
+                name.split(".")[-3]
+                for name in PathManager.ls(path)
+                if name.startswith(split) and name.endswith(".idx")
+            }
+            for direction in directions:
+                shards[direction] += 1
+        return shards
+
+    def get_split_num_data_shards(self, split):
+        """Return (and cache per split) the number of shards of every dataset key.
+
+        "mono_" categories are counted by target language; other categories use
+        "src-tgt", falling back to the reversed direction when only that exists.
+        """
+        if split in self._num_shards_dict:
+            return self._num_shards_dict[split]
+        num_shards_dict = {}
+        data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
+        for data_category, paths in data_paths.items():
+            if data_category not in lang_pairs:
+                continue
+            shards_dict = self._get_shard_num_dict(split, utils.split_paths(paths))
+            for lang_pair in lang_pairs[data_category]:
+                parts = lang_pair.split("-")
+                src, tgt = parts if len(parts) > 1 else (parts[0], parts[0])
+                key = self.get_dataset_key(data_category, src, tgt)
+                if "mono_" in data_category:  # monolingual data requires tgt only
+                    assert src is None or src == tgt, (
+                        f"error: src={src}, "
+                        f"tgt={tgt} for data_category={data_category}"
+                    )
+                    num_shards_dict[key] = shards_dict[tgt]
+                    continue
+                # fairseq tradition: fall back to the reversed direction's data
+                for direction in (f"{src}-{tgt}", f"{tgt}-{src}"):
+                    if direction in shards_dict:
+                        num_shards_dict[key] = shards_dict[direction]
+                        break
+        self._num_shards_dict[split] = num_shards_dict
+        logger.info(f"[{split}] num of shards: {num_shards_dict}")
+        return num_shards_dict
+
+    @classmethod
+    def get_shard_id(cls, num_shards, epoch, shard_epoch=None):
+        """Return the 0-based shard for 1-based ``shard_epoch`` (or ``epoch`` if it is None)."""
+        one_based = shard_epoch if shard_epoch is not None else epoch
+        return (one_based - 1) % num_shards
+
+    def get_split_data_path(self, paths, epoch, shard_epoch, num_shards):
+        """Return the entry of ``paths`` selected by ``get_shard_id`` for this epoch."""
+        return paths[self.get_shard_id(num_shards, epoch, shard_epoch)]
+
+    def get_split_data_param_list(self, split, epoch, shard_epoch=None):
+        """Build the ``load_a_dataset`` keyword dicts for every dataset of ``split``.
+
+        One dict is produced per data category and language pair, pointing at the
+        shard path chosen for ``epoch``/``shard_epoch``. Splits other than the
+        training subset only ever use the first path.
+        """
+        # TODO: to extend with extra datasets and keys and loop over different shard data paths
+        param_list = []
+        data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
+        logger.info(f"langtoks settings: {self.args.langtoks}")
+        split_num_shards_dict = self.get_split_num_data_shards(split)
+        is_train = split == getattr(self.args, "train_subset", None)
+        for data_category, paths in data_paths.items():
+            if data_category not in lang_pairs:
+                continue
+            paths = utils.split_paths(paths)
+            assert len(paths) > 0
+            if len(paths) > 1:
+                self._has_sharded_data = True
+            if not is_train:
+                # if not training data set, use the first shard for valid and test
+                paths = paths[:1]
+            lang_tok_spec = self.args.langtoks.get(data_category, (None, None))
+            for lang_pair in lang_pairs[data_category]:
+                parts = lang_pair.split("-")
+                # infer langcode: a lone code stands for the pair (lang, lang)
+                src, tgt = parts if len(parts) > 1 else (parts[0], parts[0])
+                assert src is not None or data_category == "mono_dae", (
+                    f"error: src={src}, " f"tgt={tgt} for data_category={data_category}"
+                )
+                key = self.get_dataset_key(data_category, src, tgt)
+                data_path = self.get_split_data_path(
+                    paths, epoch, shard_epoch, split_num_shards_dict[key]
+                )
+                if src and data_category != "mono_dae":
+                    src_dict = self.get_source_dictionary(src)
+                else:
+                    src_dict = None
+                param_list.append(
+                    {
+                        "key": key,
+                        "data_path": data_path,
+                        "split": split,
+                        "src": src,
+                        "src_dict": src_dict,
+                        "tgt": tgt,
+                        "tgt_dict": self.get_target_dictionary(tgt),
+                        "data_category": data_category,
+                        "langtok_spec": lang_tok_spec,
+                    }
+                )
+        return param_list
+
+    def get_train_dataset_sizes(
+        self, data_param_list, datasets, epoch, shard_epoch=None
+    ):
+        num_shards = [
+            self.get_split_num_data_shards(param["split"])[param["key"]]
+            for param in data_param_list
+        ]
+        data_sizes = []
+        for (key, d), num_shard in zip(datasets, num_shards):
+            my_data_sizes = self._training_data_sizes[key]
+            shard_ind = self.get_shard_id(num_shard, epoch, shard_epoch)
+            if shard_ind not in my_data_sizes:
+                my_data_sizes[shard_ind] = len(d)
+            known_size = max(my_data_sizes.values())
+            data_sizes.append(
+                # If we don't know the data size of the shard yet,
+                # use the max known data size to approximate.
+                # Note that we preprocess shards by a designated shard size
+                # and put any remaining data at the end into the last shard so
+                # the max shard size approximation is almost correct before loading
+                # the last shard; after loading the last shard, it will have the
+                # exact data sizes of the whole data size.
+                (key, sum(my_data_sizes.get(i, known_size) for i in range(num_shard)))
+            )
+        logger.info(
+            f"estimated total data sizes of all shards used in sampling ratios: {data_sizes}. "
+            "Note that if the data a shard has not been loaded yet, use the max known data size to approximate"
+        )
+        return [s for _, s in data_sizes]
+
+    def get_train_sampling_ratios(
+        self, data_param_list, datasets, epoch=1, shard_epoch=None
+    ):
+        """Map estimated dataset sizes to ratios; None when no sampling function."""
+        sizes = self.get_train_dataset_sizes(
+            data_param_list, datasets, epoch, shard_epoch
+        )
+        selector = self.sampling_method.sampling_method_selector()
+        return None if selector is None else selector(sizes)
+
+    def get_sampling_ratios(self, data_param_list, datasets, epoch, shard_epoch=None):
+        if self.args.sampling_weights_from_file:
+            weights = load_sampling_weights(self.args.sampling_weights_from_file)
+            sample_ratios = [weights[k] for k, _ in datasets]
+            logger.info(
+                "| ignoring --sampling-weights when loadding sampling weights "
+                f"from file {self.args.sampling_weights_from_file}"
+            )
+        elif self.args.sampling_weights:
+            sample_ratios = [self.args.sampling_weights[k] for k, _ in datasets]
+        else:
+            sample_ratios = self.get_train_sampling_ratios(
+                data_param_list, datasets, epoch, shard_epoch
+            )
+
+        if sample_ratios is not None:
+            logger.info(
+                "| Upsample ratios: {}".format(
+                    list(zip(map(lambda x: x["key"], data_param_list), sample_ratios))
+                )
+            )
+            assert len(sample_ratios) == len(datasets)
+        return sample_ratios
+
+    def load_split_datasets(
+        self, split, training, epoch=1, combine=False, shard_epoch=None, **kwargs
+    ):
+        data_param_list = self.get_split_data_param_list(
+            split, epoch, shard_epoch=shard_epoch
+        )
+        langpairs_sharing_datasets = (
+            {} if self.args.enable_reservsed_directions_shared_datasets else None
+        )
+        datasets = [
+            (
+                param["key"],
+                self.load_a_dataset(
+                    combine=combine,
+                    langpairs_sharing_datasets=langpairs_sharing_datasets,
+                    **param,
+                ),
+            )
+            for param in data_param_list
+        ]
+        return datasets, data_param_list
+
+    def load_into_concat_dataset(self, split, datasets, data_param_list):
+        if self.args.lang_tok_replacing_bos_eos:
+            # TODO: to investigate why TransformEosLangPairDataset doesn't work with ConcatDataset
+            return SampledMultiDataset(
+                OrderedDict(datasets),
+                sampling_ratios=None,
+                eval_key=None,
+                collate_format=CollateFormat.single,
+                virtual_size=None,
+                split=split,
+            )
+        return ConcatDataset([d for _, d in datasets])
+
+    def load_sampled_multi_epoch_dataset(
+        self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs
+    ):
+        """SampledMultiEpochDataset for the train subset, concat dataset otherwise."""
+        datasets, data_param_list = self.load_split_datasets(
+            split, training, epoch, combine, shard_epoch=shard_epoch, **kwargs
+        )
+        if not training or split != getattr(self.args, "train_subset", None):
+            return self.load_into_concat_dataset(split, datasets, data_param_list)
+        # pass shard_epoch so sizes are recorded under the shard that was loaded
+        ratios = self.get_sampling_ratios(data_param_list, datasets, epoch, shard_epoch)
+        return SampledMultiEpochDataset(
+            OrderedDict(datasets),
+            epoch=epoch,
+            shard_epoch=shard_epoch,
+            sampling_ratios=ratios,
+            eval_key=None,
+            collate_format=CollateFormat.single,
+            virtual_size=self.args.virtual_data_size,
+            split=split,
+            virtual_epoch_size=self.args.virtual_epoch_size,
+            # if not using lang_tok altering, simplified to use the same collater
+            shared_collater=self._shared_collater(),
+        )
+
+    def load_sampled_multi_dataset(
+        self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs
+    ):
+        """SampledMultiDataset for the train subset, concat dataset otherwise."""
+        datasets, data_param_list = self.load_split_datasets(
+            split, training, epoch, combine, shard_epoch=shard_epoch, **kwargs
+        )
+        if not training or split != getattr(self.args, "train_subset", None):
+            return self.load_into_concat_dataset(split, datasets, data_param_list)
+        # pass shard_epoch so sizes are recorded under the shard that was loaded
+        ratios = self.get_sampling_ratios(data_param_list, datasets, epoch, shard_epoch)
+        return SampledMultiDataset(
+            OrderedDict(datasets),
+            epoch=epoch,
+            sampling_ratios=ratios,
+            eval_key=None,
+            collate_format=CollateFormat.single,
+            virtual_size=self.args.virtual_data_size,
+            split=split,
+            # if not using lang_tok altering, simplified to use the same collater
+            shared_collater=self._shared_collater(),
+        )
+
+    def load_dataset(
+        self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs
+    ):
+        """Use the multi-epoch loader iff args.virtual_epoch_size is set."""
+        # without a virtual epoch size the plain sampled multi-dataset is used
+        loader = (
+            self.load_sampled_multi_dataset
+            if self.args.virtual_epoch_size is None
+            else self.load_sampled_multi_epoch_dataset
+        )
+        return loader(split, training, epoch, combine, shard_epoch, **kwargs)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
new file mode 100644
index 0000000000..b4e0f9828c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
@@ -0,0 +1,63 @@
+from enum import Enum
+from typing import Dict, List, Optional, Sequence
+
+import torch
+from fairseq.data import Dictionary
+
+
+class EncoderLangtok(Enum):
+    """
+    Prepend to the beginning of source sentence either the
+    source or target language token. (src/tgt).
+    """
+
+    src = "src"
+    tgt = "tgt"
+
+
+class LangTokSpec(Enum):
+    main = "main"
+    mono_dae = "mono_dae"
+
+
+class LangTokStyle(Enum):
+    multilingual = "multilingual"
+    mbart = "mbart"
+
+
+@torch.jit.export
+def get_lang_tok(
+    lang: str, lang_tok_style: str, spec: str = LangTokSpec.main.value
+) -> str:
+    # TOKEN_STYLES can't be defined outside this fn since it needs to be
+    # TorchScriptable.
+    TOKEN_STYLES: Dict[str, str] = {
+        LangTokStyle.mbart.value: "[{}]",
+        LangTokStyle.multilingual.value: "__{}__",
+    }
+
+    if spec.endswith("dae"):
+        lang = f"{lang}_dae"
+    elif spec.endswith("mined"):
+        lang = f"{lang}_mined"
+    style = TOKEN_STYLES[lang_tok_style]
+    return style.format(lang)
+
+
+def augment_dictionary(
+    dictionary: Dictionary,
+    language_list: List[str],
+    lang_tok_style: str,
+    langtoks_specs: Sequence[str] = (LangTokSpec.main.value,),
+    extra_data: Optional[Dict[str, str]] = None,
+) -> None:
+    for spec in langtoks_specs:
+        for language in language_list:
+            dictionary.add_symbol(
+                get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec)
+            )
+
+    if lang_tok_style == LangTokStyle.mbart.value or (
+        extra_data is not None and LangTokSpec.mono_dae.value in extra_data
+    ):
+        dictionary.add_symbol("<mask>")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
new file mode 100644
index 0000000000..ece9a9721e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
@@ -0,0 +1,468 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import datetime
+import hashlib
+import logging
+import time
+from bisect import bisect_right
+from collections import OrderedDict, defaultdict
+from enum import Enum
+from typing import List
+
+import numpy as np
+import torch
+
+from fairseq.data import FairseqDataset, data_utils
+from fairseq.distributed import utils as distributed_utils
+
+
+def get_time_gap(s, e):
+    """Return the time elapsed between POSIX timestamps s and e as a string."""
+    delta = datetime.datetime.fromtimestamp(e) - datetime.datetime.fromtimestamp(s)
+    return str(delta)
+
+
+logger = logging.getLogger(__name__)
+
+
+def default_virtual_size_func(datasets, ratios, max_scale_up=1.5):
+    """Default virtual size: scaled to the largest dataset, capped at max_scale_up."""
+    sizes = [len(d) for d in datasets]
+    total = sum(sizes)
+    if ratios is None:
+        return total
+    # express every dataset's virtual size relative to the largest dataset
+    top = np.argmax(sizes)
+    top_ratio, top_size = ratios[top], sizes[top]
+    vsize = sum((r / top_ratio) * top_size for r in ratios)
+    cap = total * max_scale_up
+    return int(vsize if vsize < cap else cap)
+
+
+class CollateFormat(Enum):
+    single = 1
+    ordered_dict = 2
+
+
+class SampledMultiDataset(FairseqDataset):
+    """Samples from multiple sub-datasets according to given sampling ratios.
+    Args:
+        datasets (
+            List[~torch.utils.data.Dataset]
+            or OrderedDict[str, ~torch.utils.data.Dataset]
+        ): datasets
+        sampling_ratios (List[float]): list of probability of each dataset to be sampled
+            (default: None, which corresponds to concatenating all dataset together).
+        seed (int): RNG seed to use (default: 2).
+        epoch (int): starting epoch number (default: 1).
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+        collate_format (CollateFormat):  collater output format, either CollateFormat.ordered_dict or
+            CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures
+            the collater to output batches of data mixed from all sub-datasets,
+            and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys
+            of sub-datasets.
+            Note that not all sub-datasets will present in a single batch in both formats.
+        virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func).
+        split (str): the split of the data, e.g. 'train', 'valid' or 'test'.
+        shared_collater (bool): whether or not to all sub-datasets have the same collater.
+        shuffle (bool): whether or not to shuffle data (default: True).
+    """
+
+    def __init__(
+        self,
+        datasets,
+        sampling_ratios=None,
+        seed=2,
+        epoch=1,
+        eval_key=None,
+        collate_format=CollateFormat.single,
+        virtual_size=default_virtual_size_func,
+        split="",
+        shared_collater=False,
+        shuffle=True,
+    ):
+        super().__init__()
+        self.shared_collater = shared_collater
+        self.shuffle = shuffle
+
+        if isinstance(datasets, OrderedDict):
+            self.keys = list(datasets.keys())
+            datasets = list(datasets.values())
+        elif isinstance(datasets, List):
+            self.keys = list(range(len(datasets)))
+        else:
+            raise AssertionError()
+        self.datasets = datasets
+        self.split = split
+
+        self.eval_key = eval_key
+        if self.eval_key is not None:
+            self.collate_format = CollateFormat.single
+        else:
+            self.collate_format = collate_format
+
+        self.seed = seed
+        self._cur_epoch = None
+
+        self.cumulated_sizes = None
+        # self.datasets[k][self._cur_indices[i]] is the data item i in this sampled dataset
+        # namely, data item i is sampled from the kth sub-dataset self.datasets[k]
+        # where self.cumulated_sizes[k-1] <= i < self.cumulated_sizes[k]
+        self._cur_indices = None
+
+        self._sizes = None
+        self.virtual_size_per_dataset = None
+        # caching properties
+        self._reset_cached_properties()
+        self.setup_sampling(sampling_ratios, virtual_size)
+        self.set_epoch(epoch)
+
+    def _clean_if_not_none(self, var_list):
+        for v in var_list:
+            if v is not None:
+                del v
+
+    def _reset_cached_properties(self):
+        self._clean_if_not_none([self._sizes, self._cur_indices])
+        self._sizes = None
+        self._cur_indices = None
+
+    def setup_sampling(self, sample_ratios, virtual_size):
+        sizes = [len(d) for d in self.datasets]
+        if sample_ratios is None:
+            # default back to concating datasets
+            self.sample_ratios = None
+            self.virtual_size = sum(sizes)
+        else:
+            if not isinstance(sample_ratios, np.ndarray):
+                sample_ratios = np.array(sample_ratios)
+            self.sample_ratios = sample_ratios
+            virtual_size = (
+                default_virtual_size_func if virtual_size is None else virtual_size
+            )
+            self.virtual_size = (
+                virtual_size(self.datasets, self.sample_ratios)
+                if callable(virtual_size)
+                else virtual_size
+            )
+
+    def adjust_sampling(self, epoch, sampling_ratios, virtual_size):
+        if sampling_ratios is not None:
+            sampling_ratios = self._sync_sample_ratios(sampling_ratios)
+            self.setup_sampling(sampling_ratios, virtual_size)
+
+    def _sync_sample_ratios(self, ratios):
+        # Convert ratios to a float64 numpy array. When torch.distributed is
+        # initialized, all-reduce first so every process updates at the same pace.
+        ratios = torch.DoubleTensor(ratios)
+        if torch.distributed.is_initialized():
+            if torch.cuda.is_available():
+                distributed_utils.all_reduce(
+                    ratios.cuda(), group=distributed_utils.get_data_parallel_group()
+                )
+            else:
+                distributed_utils.all_reduce(
+                    ratios, group=distributed_utils.get_data_parallel_group()
+                )
+        # NOTE(review): ratios.cuda() is a copy, so the CUDA-path reduce result is
+        # not what gets returned below -- confirm whether that is intended.
+        return ratios.cpu().numpy()
+
+    def random_choice_in_dataset(self, rng, dataset, choice_size):
+        # a dataset may provide its own sampler; otherwise draw uniformly,
+        # with replacement only when more items are requested than exist
+        if hasattr(dataset, "random_choice_in_dataset"):
+            return dataset.random_choice_in_dataset(rng, choice_size)
+        n = len(dataset)
+        return rng.choice(n, choice_size, replace=(choice_size > n))
+
+    def get_virtual_indices(self, rng, datasets, sample_ratios, virtual_size):
+        def get_counts(sample_ratios):
+            counts = np.array([virtual_size * r for r in sample_ratios], dtype=np.int64)
+            diff = virtual_size - counts.sum()
+            assert diff >= 0
+            # due to round-offs, the size might not match the desired sizes
+            if diff > 0:
+                dataset_indices = rng.choice(
+                    len(sample_ratios), size=diff, p=sample_ratios
+                )
+                for i in dataset_indices:
+                    counts[i] += 1
+            return counts
+
+        def get_in_dataset_indices(datasets, sizes, sample_ratios):
+            counts = get_counts(sample_ratios)
+            # uniformally sample desired counts for each dataset
+            # if the desired counts are large, sample with replacement:
+            indices = [
+                self.random_choice_in_dataset(rng, d, c)
+                for c, d in zip(counts, datasets)
+            ]
+            return indices
+
+        sizes = [len(d) for d in datasets]
+        if sample_ratios is None:
+            # default back to concating datasets
+            in_dataset_indices = [list(range(s)) for s in sizes]
+            virtual_sizes_per_dataset = sizes
+        else:
+            ratios = sample_ratios / sample_ratios.sum()
+            in_dataset_indices = get_in_dataset_indices(datasets, sizes, ratios)
+            virtual_sizes_per_dataset = [len(d) for d in in_dataset_indices]
+        virtual_sizes_per_dataset = np.array(virtual_sizes_per_dataset, np.int64)
+        cumulative_sizes = np.cumsum(virtual_sizes_per_dataset)
+        assert sum(virtual_sizes_per_dataset) == virtual_size
+        assert cumulative_sizes[-1] == virtual_size
+        if virtual_size < sum(sizes):
+            logger.warning(
+                f"virtual data size ({virtual_size}) is less than real data size ({sum(sizes)})."
+                " If virtual size << real data size, there could be data coverage issue."
+            )
+        in_dataset_indices = np.hstack(in_dataset_indices)
+        return in_dataset_indices, cumulative_sizes, virtual_sizes_per_dataset
+
+    def _get_dataset_and_index(self, index):
+        i = bisect_right(self.cumulated_sizes, index)
+        return i, self._cur_indices[index]
+
+    def __getitem__(self, index):
+        # self.__getitem__(index) returns (k, self.datasets[k][self._cur_indices[index]])
+        # where k satisfies self.cumulated_sizes[k - 1] <= index < self.cumulated_sizes[k]
+        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
+        ret = (ds_idx, self.datasets[ds_idx][ds_sample_idx])
+        return ret
+
+    def num_tokens(self, index):
+        return self.sizes[index].max()
+
+    def num_tokens_vec(self, indices):
+        sizes_vec = self.sizes[np.array(indices)]
+        # max across all dimensions but first one
+        return np.amax(sizes_vec, axis=tuple(range(1, len(sizes_vec.shape))))
+
+    def size(self, index):
+        return self.sizes[index]
+
+    def __len__(self):
+        return self.virtual_size
+
+    def collater(self, samples, **extra_args):
+        """Merge a list of (dataset index, sample) pairs to form a mini-batch.
+        Returns None for empty input and an OrderedDict of per-dataset batches
+        for CollateFormat.ordered_dict; otherwise one merged batch."""
+        if len(samples) == 0:
+            return None
+        if self.collate_format == CollateFormat.ordered_dict:
+            collect_samples = [[] for _ in range(len(self.datasets))]
+            for (i, sample) in samples:
+                collect_samples[i].append(sample)
+            batch = OrderedDict(
+                (key, dataset.collater(collect_samples[i]))
+                for i, (key, dataset) in enumerate(zip(self.keys, self.datasets))
+                if len(collect_samples[i]) > 0
+            )
+        elif self.shared_collater:
+            batch = self.datasets[0].collater([s for _, s in samples])
+        else:
+            samples_dict = defaultdict(list)
+            pad_to_length = (
+                defaultdict(int)
+                if "pad_to_length" not in extra_args
+                else extra_args["pad_to_length"]
+            )
+            for ds_idx, s in samples:
+                pad_to_length["source"] = max(
+                    pad_to_length["source"], s["source"].size(0)
+                )
+                if s["target"] is not None:
+                    pad_to_length["target"] = max(
+                        pad_to_length["target"], s["target"].size(0)
+                    )
+                samples_dict[ds_idx].append(s)
+            batches = [
+                self.datasets[i].collater(samples_dict[i], pad_to_length=pad_to_length)
+                for i in range(len(self.datasets))
+                if len(samples_dict[i]) > 0
+            ]
+
+            def straight_data(tensors):
+                return torch.cat(tensors, dim=0)
+
+            src_lengths = straight_data(
+                [b["net_input"]["src_lengths"] for b in batches]
+            )
+            src_lengths, sort_order = src_lengths.sort(descending=True)
+
+            def straight_order(tensors):
+                batch = straight_data(tensors)
+                return batch.index_select(0, sort_order)
+
+            batch = {
+                "id": straight_order([b["id"] for b in batches]),
+                "nsentences": sum(b["nsentences"] for b in batches),
+                "ntokens": sum(b["ntokens"] for b in batches),
+                "net_input": {
+                    "src_tokens": straight_order(
+                        [b["net_input"]["src_tokens"] for b in batches]
+                    ),
+                    "src_lengths": src_lengths,
+                },
+                "target": straight_order([b["target"] for b in batches])
+                if batches[0]["target"] is not None
+                else None,
+            }
+            if "prev_output_tokens" in batches[0]["net_input"]:
+                batch["net_input"]["prev_output_tokens"] = straight_order(
+                    [b["net_input"]["prev_output_tokens"] for b in batches]
+                )
+            if "src_lang_id" in batches[0]["net_input"]:
+                batch["net_input"]["src_lang_id"] = straight_order(
+                    [b["net_input"]["src_lang_id"] for b in batches]
+                )
+            if "tgt_lang_id" in batches[0]:
+                batch["tgt_lang_id"] = straight_order(
+                    [b["tgt_lang_id"] for b in batches]
+                )
+        return batch
+
+    @property
+    def sizes(self):
+        if self._sizes is not None:
+            return self._sizes
+        start_time = time.time()
+        in_sub_dataset_indices = [
+            self._cur_indices[
+                0 if i == 0 else self.cumulated_sizes[i - 1] : self.cumulated_sizes[i]
+            ]
+            for i in range(len(self.datasets))
+        ]
+        sub_dataset_sizes = [
+            d.sizes[indices]
+            for d, indices in zip(self.datasets, in_sub_dataset_indices)
+        ]
+        self._sizes = np.vstack(sub_dataset_sizes)
+        logger.info(f"sizes() calling time: {get_time_gap(start_time, time.time())}")
+        return self._sizes
+
+    def ordered_indices(self):
+        if self.shuffle:
+            indices = np.random.permutation(len(self))
+        else:
+            indices = np.arange(len(self))
+
+        sizes = self.sizes
+        tgt_sizes = sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None
+        src_sizes = (
+            sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes
+        )
+
+        # sort by target length, then source length
+        if tgt_sizes is not None:
+            indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")]
+        sort_indices = indices[np.argsort(src_sizes[indices], kind="mergesort")]
+        return sort_indices
+
+    def prefetch(self, indices):
+        prefetch_indices = [[] for _ in range(len(self.datasets))]
+        for i in indices:
+            ds_idx, ds_sample_idx = self._get_dataset_and_index(i)
+            prefetch_indices[ds_idx].append(ds_sample_idx)
+        for i in range(len(prefetch_indices)):
+            self.datasets[i].prefetch(prefetch_indices[i])
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return False
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        if epoch == self._cur_epoch:
+            # re-enter so return
+            return
+        for d in self.datasets:
+            if hasattr(d, "set_epoch"):
+                d.set_epoch(epoch)
+        self._cur_epoch = epoch
+        self._establish_virtual_datasets()
+
+    def _establish_virtual_datasets(self):
+        if self.sample_ratios is None and self._cur_indices is not None:
+            # not a samping dataset, no need to resample if indices are already established
+            return
+        self._reset_cached_properties()
+
+        start_time = time.time()
+        # Generate a weighted sample of indices as a function of the
+        # random seed and the current epoch.
+        rng = np.random.RandomState(
+            [
+                int(
+                    hashlib.sha1(
+                        str(self.__class__.__name__).encode("utf-8")
+                    ).hexdigest(),
+                    16,
+                )
+                % (2**32),
+                self.seed % (2**32),  # global seed
+                self._cur_epoch,  # epoch index,
+            ]
+        )
+        self._clean_if_not_none(
+            [self.cumulated_sizes, self.virtual_size_per_dataset, self._sizes]
+        )
+        self._sizes = None
+
+        indices, cumulated_sizes, virtual_size_per_dataset = self.get_virtual_indices(
+            rng, self.datasets, self.sample_ratios, self.virtual_size
+        )
+        self._cur_indices = indices
+        self.cumulated_sizes = cumulated_sizes
+        self.virtual_size_per_dataset = virtual_size_per_dataset
+
+        raw_sizes = [len(d) for d in self.datasets]
+        sampled_sizes = self.virtual_size_per_dataset
+        logger.info(
+            f"[{self.split}] Raw sizes: {str(dict(zip(self.keys, raw_sizes)))}; "
+            f"raw total size: {sum(raw_sizes)}"
+        )
+        logger.info(
+            f"[{self.split}] Resampled sizes: {str(dict(zip(self.keys, sampled_sizes)))}; "
+            f"resampled total size: {sum(sampled_sizes)}"
+        )
+        if self.sample_ratios is not None:
+            logger.info(
+                f"[{self.split}] Upsampling ratios: {str(dict(zip(self.keys, self.sample_ratios)))}"
+            )
+        else:
+            logger.info(f"[{self.split}] A concat dataset")
+        logger.info(
+            f"[{self.split}] virtual dataset established time: {get_time_gap(start_time, time.time())}"
+        )
+
+    def filter_indices_by_size(self, indices, max_sizes):
+        """Filter a list of sample indices. Remove those that are longer
+            than specified in max_sizes.
+
+        Args:
+            indices (np.array): original array of sample indices
+            max_sizes (int or list[int] or tuple[int]): max sample size,
+                can be defined separately for src and tgt (then list or tuple)
+
+        Returns:
+            np.array: filtered sample array
+            list: list of removed indices
+        """
+        sizes = self.sizes
+        tgt_sizes = sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None
+        src_sizes = (
+            sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes
+        )
+
+        return data_utils.filter_paired_dataset_indices_by_size(
+            src_sizes, tgt_sizes, indices, max_sizes
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
new file mode 100644
index 0000000000..bb187a8dc2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import hashlib
+import logging
+import math
+
+import numpy as np
+
+from fairseq.data import SampledMultiDataset
+
+from .sampled_multi_dataset import CollateFormat, default_virtual_size_func
+
+logger = logging.getLogger(__name__)
+
+
+class SampledMultiEpochDataset(SampledMultiDataset):
+    """Samples from multiple sub-datasets according to sampling ratios
+       using virtual epoch sizes to speed up dataloading.
+    Args:
+        datasets (
+            List[~torch.utils.data.Dataset]
+            or OrderedDict[str, ~torch.utils.data.Dataset]
+        ): datasets
+        sampling_ratios (List[float]): list of probability of each dataset to be sampled
+            (default: None, which corresponds to concatenating all datasets together).
+        seed (int): RNG seed to use (default: 2).
+        epoch (int): starting epoch number (default: 1).
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+        collate_format (CollateFormat):  collater output format, either CollateFormat.ordered_dict or
+            CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures
+            the collater to output batches of data mixed from all sub-datasets,
+            and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys
+            of sub-datasets.
+            Note that not all sub-datasets will be present in a single batch in both formats.
+        virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func).
+        split (str): the split of the data, e.g. 'train', 'valid' or 'test'.
+        virtual_epoch_size (int): virtual epoch size, the dataset will go through the data by
+            this virtual epoch size one by one to speed up data loading, e.g. indexing and filtering
+            can be performed whenever a virtual epoch is loaded without waiting for the whole dataset to be loaded.
+        shared_collater (bool): whether or not all sub-datasets have the same collater.
+        shard_epoch (int): the real epoch number for shard selection.
+        shuffle (bool): whether or not to shuffle data (default: True).
+    """
+
+    def __init__(
+        self,
+        datasets,
+        sampling_ratios=None,
+        seed=2,
+        epoch=1,
+        eval_key=None,
+        collate_format=CollateFormat.single,
+        virtual_size=default_virtual_size_func,
+        split="",
+        virtual_epoch_size=None,
+        shared_collater=False,
+        shard_epoch=1,
+        shuffle=True,
+    ):
+        self.virtual_epoch_size = virtual_epoch_size
+        self._current_epoch_start_index = None
+        self._random_global_indices = None
+        self.shard_epoch = shard_epoch if shard_epoch is not None else 1
+        self.load_next_shard = None  # None until the first _next_global_indices call
+        self._epoch_sizes = None
+        super().__init__(
+            datasets=datasets,
+            sampling_ratios=sampling_ratios,
+            seed=seed,
+            epoch=epoch,
+            eval_key=eval_key,
+            collate_format=collate_format,
+            virtual_size=virtual_size,
+            split=split,
+            shared_collater=shared_collater,
+            shuffle=shuffle,
+        )
+
+    def _setup(self, epoch):
+        self.virtual_epoch_size = (
+            self.virtual_epoch_size
+            if self.virtual_epoch_size is not None
+            else self.virtual_size
+        )
+        if self.virtual_epoch_size > self.virtual_size:
+            logger.warning(
+                f"virtual epoch size {self.virtual_epoch_size} "
+                f"is greater than virtual dataset size {self.virtual_size}"
+            )
+            self.virtual_epoch_size = self.virtual_size
+        self.num_virtual_epochs = math.ceil(self.virtual_size / self.virtual_epoch_size)
+        self._current_epoch_start_index = self._get_epoch_start_index(epoch)
+        logger.info(
+            f"virtual epoch size {self.virtual_epoch_size}; virtual dataset size {self.virtual_size}"
+        )
+
+    def _map_epoch_index_to_global(self, index):
+        index = self._current_epoch_start_index + index
+        # add randomness: look the position up in the shuffled global indices
+        return self._random_global_indices[index]
+
+    @property
+    def sizes(self):
+        if self._epoch_sizes is not None:
+            return self._epoch_sizes
+        _sizes = super().sizes
+        indices = self._random_global_indices[
+            self._current_epoch_start_index : self._current_epoch_start_index
+            + len(self)
+        ]
+        self._epoch_sizes = _sizes[indices]
+        # del super()._sizes to save memory
+        del self._sizes
+        self._sizes = None
+        return self._epoch_sizes
+
+    def _get_dataset_and_index(self, index):
+        i = self._map_epoch_index_to_global(index)
+        return super()._get_dataset_and_index(i)
+
+    def __len__(self):
+        return (
+            self.virtual_epoch_size
+            if self._current_epoch_start_index + self.virtual_epoch_size
+            < self.virtual_size
+            else self.virtual_size - self._current_epoch_start_index
+        )
+
+    def set_epoch(self, epoch):
+        if self._current_epoch_start_index is None:
+            # initializing epoch indices of a virtual dataset
+            self._setup(epoch)
+            self._next_virtual_epoch(epoch)
+        else:
+            # working on already initialized epoch indices
+            if epoch == self._cur_epoch:
+                # re-entered with the same epoch, so there is nothing to do
+                return
+            self._next_virtual_epoch(epoch)
+
+    def _get_epoch_start_index(self, epoch):
+        assert epoch >= 1  # fairseq is using 1-based epoch everywhere
+        return ((epoch - 1) % self.num_virtual_epochs) * self.virtual_epoch_size
+
+    def _next_global_indices(self, epoch):
+        rng = np.random.RandomState(
+            [
+                int(
+                    hashlib.sha1(
+                        str(self.__class__.__name__).encode("utf-8")
+                    ).hexdigest(),
+                    16,
+                )
+                % (2**32),
+                self.seed % (2**32),  # global seed
+                epoch,  # epoch index,
+            ]
+        )
+        del self._random_global_indices
+        self._random_global_indices = rng.choice(
+            self.virtual_size, self.virtual_size, replace=False
+        )
+        if self.load_next_shard is None:
+            self.load_next_shard = False
+        else:
+            # increase shard epoch for next loading
+            self.shard_epoch += 1
+            self.load_next_shard = True
+            logger.info(
+                "to load next epoch/shard in next load_dataset: "
+                f"epoch={epoch}/shard_epoch={self.shard_epoch}"
+            )
+
+    def _next_virtual_epoch(self, epoch):
+        index = self._get_epoch_start_index(epoch)
+        if index == 0 or self._random_global_indices is None:
+            # need to start from the beginning,
+            # so call super().set_epoch(epoch) to establish the global virtual indices
+            logger.info(
+                "establishing a new set of global virtual indices for "
+                f"epoch={epoch}/shard_epoch={self.shard_epoch}"
+            )
+            super().set_epoch(epoch)
+            self._next_global_indices(epoch)
+        else:
+            self._cur_epoch = epoch
+
+        # reset the cached per-epoch sizes after moving to a new epoch
+        self._clean_if_not_none(
+            [
+                self._epoch_sizes,
+            ]
+        )
+        self._epoch_sizes = None
+        self._current_epoch_start_index = index
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
new file mode 100644
index 0000000000..140c68f01d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List
+
+
+logger = logging.getLogger(__name__)
+
+
+def uniform(dataset_sizes: List[int]):
+    return [1.0 for _ in dataset_sizes]
+
+
+def temperature_sampling(dataset_sizes, temp):
+    denom = sum(dataset_sizes)
+    return [pow(n / denom, 1.0 / temp) for n in dataset_sizes]
+
+
+def make_temperature_sampling(temp=1.0):
+    """Return a function of dataset sizes that calls temperature_sampling
+    with *temp* fixed to the value given here."""
+
+    return lambda dataset_sizes: temperature_sampling(dataset_sizes, temp)
+
+
+def make_ratio_sampling(ratios):
+    """Return a function that ignores the dataset sizes it is given and
+    always returns *ratios* (the same object, not a copy)."""
+
+    return lambda dataset_sizes: ratios
+
+
+class SamplingMethod:
+    """Command-line options and selection of the dataset sampling function."""
+
+    @staticmethod
+    def add_arguments(parser):
+        """Register --sampling-method and --sampling-temperature on *parser*."""
+        methods = ["uniform", "temperature", "concat", "RoundRobin"]
+        parser.add_argument(
+            "--sampling-method",
+            choices=methods,
+            type=str,
+            default="concat",
+            help="The method to sample data per language pairs",
+        )
+        parser.add_argument(
+            "--sampling-temperature",
+            default=1.5,
+            type=float,
+            help="only work with --sampling-method temperature",
+        )
+
+    @staticmethod
+    def build_sampler(args, task):
+        """Create a SamplingMethod for the given *args* and *task*."""
+        return SamplingMethod(args, task)
+
+    def __init__(self, args, task):
+        self.args, self.task = args, task
+
+    def is_adaptive(self):
+        """Always False in this class."""
+        return False
+
+    def sampling_method_selector(self):
+        """Return the sampling function chosen by ``args.sampling_method``,
+        or None, which means concatenating all datasets together."""
+        method = self.args.sampling_method
+        logger.info(f"selected sampler: {method}")
+        if method == "uniform":
+            return uniform
+        if method == "temperature" or self.is_adaptive():
+            return make_temperature_sampling(float(self.args.sampling_temperature))
+        return None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
new file mode 100644
index 0000000000..52e74abdda
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+import torch
+from torch.utils.data.dataloader import default_collate
+
+from . import FairseqDataset
+
+
+def _flatten(dico, prefix=None):
+    """Flatten a nested dictionary into an OrderedDict with dotted keys."""
+    if isinstance(dico, dict):
+        flat = OrderedDict()
+        base = "" if prefix is None else prefix + "."
+        for key, value in dico.items():
+            if value is not None:
+                flat.update(_flatten(value, base + key))
+        return flat
+    if isinstance(dico, list):
+        flat = OrderedDict()
+        for pos, value in enumerate(dico):
+            flat.update(_flatten(value, prefix + ".[" + str(pos) + "]"))
+        return flat
+    return OrderedDict({prefix: dico})
+
+
+def _unflatten(dico):
+    """Unflatten a flattened dictionary into a nested dictionary."""
+    nested = OrderedDict()
+    for dotted, value in dico.items():
+        *parents, leaf = dotted.split(".")
+        cursor = nested
+        for part in parents:
+            # a non-final "[i]" component becomes the integer key i
+            if part.startswith("[") and part.endswith("]"):
+                part = int(part[1:-1])
+            # descend, creating the intermediate level on first use
+            cursor = cursor.setdefault(part, OrderedDict())
+        cursor[leaf] = value
+    return nested
+
+
+class NestedDictionaryDataset(FairseqDataset):
+    """Combine the datasets in the (possibly nested) dict *defn*: items are
+    OrderedDicts keyed by flattened names; collated batches are nested again."""
+
+    def __init__(self, defn, sizes=None):
+        super().__init__()
+        self.defn = _flatten(defn)
+        self.sizes = [sizes] if not isinstance(sizes, (list, tuple)) else sizes
+
+        first = None
+        for v in self.defn.values():
+            if not isinstance(v, (FairseqDataset, torch.utils.data.Dataset)):
+                raise ValueError("Expected Dataset but found: {}".format(v.__class__))
+            first = first or v
+            if len(v) > 0:
+                assert len(v) == len(first), "dataset lengths must match"
+
+        self._len = len(first)
+
+    def __getitem__(self, index):
+        return OrderedDict((k, ds[index]) for k, ds in self.defn.items())
+
+    def __len__(self):
+        return self._len
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        if len(samples) == 0:
+            return {}
+        sample = OrderedDict()
+        for k, ds in self.defn.items():
+            try:
+                sample[k] = ds.collater([s[k] for s in samples])
+            except NotImplementedError:
+                sample[k] = default_collate([s[k] for s in samples])
+        return _unflatten(sample)
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return max(s[index] for s in self.sizes)
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        if len(self.sizes) == 1:
+            return self.sizes[0][index]
+        # Materialize a real tuple: a generator can be consumed only once
+        # and cannot be indexed or compared by callers.
+        return tuple(s[index] for s in self.sizes)
+
+    @property
+    def supports_prefetch(self):
+        """Whether this dataset supports prefetching."""
+        # plain torch Datasets (accepted by __init__) may lack the attribute,
+        # so default to False exactly as prefetch() does
+        return any(getattr(ds, "supports_prefetch", False) for ds in self.defn.values())
+
+    def prefetch(self, indices):
+        """Prefetch the data required for this epoch."""
+        for ds in self.defn.values():
+            if getattr(ds, "supports_prefetch", False):
+                ds.prefetch(indices)
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return all(ds.can_reuse_epoch_itr_across_epochs for ds in self.defn.values())
+
+    def set_epoch(self, epoch):
+        super().set_epoch(epoch)
+        for ds in self.defn.values():
+            ds.set_epoch(epoch)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
new file mode 100644
index 0000000000..e92e83c2cd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
@@ -0,0 +1,334 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from fairseq.data import data_utils
+
+
+class WordNoising(object):
+    """Generate a noisy version of a sentence, without changing words themselves."""
+
+    def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None):
+        """Record per dictionary index whether the entry ends a word; with no
+        marker given, ``bpe_end`` stays None and each token is its own word.
+        """
+        self.dictionary = dictionary
+        self.bpe_end = None
+        if bpe_cont_marker:
+            ends_word = [
+                not self.dictionary[i].endswith(bpe_cont_marker)
+                for i in range(len(self.dictionary))
+            ]
+            self.bpe_end = np.array(ends_word)
+        elif bpe_end_marker:
+            ends_word = [
+                self.dictionary[i].endswith(bpe_end_marker)
+                for i in range(len(self.dictionary))
+            ]
+            self.bpe_end = np.array(ends_word)
+
+        if self.bpe_end is None:
+            self.get_word_idx = self._get_token_idx
+        else:
+            self.get_word_idx = self._get_bpe_word_idx
+
+    def noising(self, x, lengths, noising_prob=0.0):
+        raise NotImplementedError()
+
+    def _get_bpe_word_idx(self, x):
+        """
+        Given a list of BPE tokens, for every index in the tokens list,
+        return the index of the word grouping that it belongs to.
+        For example, for input x corresponding to ["how", "are", "y@@", "ou"],
+        return [[0], [1], [2], [2]].
+        """
+        # x: (T x B)
+        ends = self.bpe_end[x]
+
+        # Special case when we only have one word in x. If x = [[N]],
+        # bpe_end is a scalar (bool) instead of a 2-dim array of bools,
+        # which makes the sum operation below fail.
+        if x.size(0) == 1 and x.size(1) == 1:
+            return np.array([[0]])
+
+        # do a reduce front sum to generate word ids
+        ends_after = ends[::-1].cumsum(0)[::-1]
+        return ends_after.max(0)[None, :] - ends_after
+
+    def _get_token_idx(self, x):
+        """
+        This is to extend noising functions to be able to apply to non-bpe
+        tokens, e.g. word or characters.
+        """
+        rows = [range(len(row)) for row in torch.t(x)]
+        return np.array(rows).T
+
+
+class WordDropout(WordNoising):
+    """Randomly drop input words. If not passing blank_idx (default is None),
+    then dropped words will be removed. Otherwise, it will be replaced by the
+    blank_idx."""
+
+    def __init__(
+        self,
+        dictionary,
+        default_dropout_prob=0.1,
+        bpe_cont_marker="@@",
+        bpe_end_marker=None,
+    ):
+        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
+        self.default_dropout_prob = default_dropout_prob
+
+    def noising(self, x, lengths, dropout_prob=None, blank_idx=None):
+        if dropout_prob is None:
+            dropout_prob = self.default_dropout_prob
+        # x: (T x B), lengths: B
+        if dropout_prob == 0:
+            return x, lengths
+
+        assert 0 < dropout_prob < 1
+
+        # be sure to drop entire words
+        word_idx = self.get_word_idx(x)
+        sentences = []
+        modified_lengths = []
+        for i in range(lengths.size(0)):
+            # Since dropout probabilities need to apply over non-pad tokens,
+            # it is not trivial to generate the keep mask without considering
+            # input lengths; otherwise, this could be done outside the loop
+
+            # We want to drop whole words based on word_idx grouping
+            num_words = max(word_idx[:, i]) + 1
+
+            # ith example: [x0, x1, ..., eos, pad, ..., pad]
+            # We should only generate keep probs for non-EOS tokens. Thus if the
+            # input sentence ends in EOS, the last word idx is not included in
+            # the dropout mask generation and we append True to always keep EOS.
+            # Otherwise, just generate the dropout mask for all word idx
+            # positions.
+            has_eos = x[lengths[i] - 1, i] == self.dictionary.eos()
+            if has_eos:  # has eos?
+                keep = np.random.rand(num_words - 1) >= dropout_prob
+                keep = np.append(keep, [True])  # keep EOS symbol
+            else:
+                keep = np.random.rand(num_words) >= dropout_prob
+
+            words = x[: lengths[i], i].tolist()
+
+            # TODO: speed up the following loop
+            # drop words from the input according to keep
+            new_s = [
+                w if keep[word_idx[j, i]] else blank_idx for j, w in enumerate(words)
+            ]
+            new_s = [w for w in new_s if w is not None]  # None marks a removed word
+            # we need to have at least one word in the sentence (more than the
+            # start / end sentence symbols)
+            if len(new_s) <= 1:
+                # insert at beginning in case the only token left is EOS
+                # EOS should be at end of list.
+                new_s.insert(0, words[np.random.randint(0, len(words))])
+            assert len(new_s) >= 1 and (
+                not has_eos  # Either don't have EOS at end or last token is EOS
+                or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos())
+            ), "New sentence is invalid."
+            sentences.append(new_s)
+            modified_lengths.append(len(new_s))
+        # re-construct input, padded up to the longest modified sentence
+        modified_lengths = torch.LongTensor(modified_lengths)
+        modified_x = torch.LongTensor(
+            modified_lengths.max(), modified_lengths.size(0)
+        ).fill_(self.dictionary.pad())
+        for i in range(modified_lengths.size(0)):
+            modified_x[: modified_lengths[i], i].copy_(torch.LongTensor(sentences[i]))
+
+        return modified_x, modified_lengths
+
+
+class WordShuffle(WordNoising):
+    """Shuffle words by no more than k positions."""
+
+    def __init__(
+        self,
+        dictionary,
+        default_max_shuffle_distance=3,
+        bpe_cont_marker="@@",
+        bpe_end_marker=None,
+    ):
+        super().__init__(dictionary, bpe_cont_marker, bpe_end_marker)
+        # store the configured default instead of a hard-coded constant
+        self.default_max_shuffle_distance = default_max_shuffle_distance
+
+    def noising(self, x, lengths, max_shuffle_distance=None):
+        """Return (shuffled copy of x, lengths); x is (T x B), lengths is B.
+
+        max_shuffle_distance=None uses the instance default; 0 returns the
+        inputs unchanged; any other value must be greater than 1.
+        """
+        if max_shuffle_distance is None:
+            max_shuffle_distance = self.default_max_shuffle_distance
+        if max_shuffle_distance == 0:
+            return x, lengths
+
+        # max_shuffle_distance <= 1 would return the same sequence
+        assert max_shuffle_distance > 1
+        # define noise word scores
+        noise = np.random.uniform(0, max_shuffle_distance, size=(x.size(0), x.size(1)))
+        noise[0] = -1  # do not move start sentence symbol
+        # be sure to shuffle entire words
+        word_idx = self.get_word_idx(x)
+        x2 = x.clone()
+        for i in range(lengths.size(0)):
+            length_no_eos = lengths[i]
+            if x[lengths[i] - 1, i] == self.dictionary.eos():
+                length_no_eos = lengths[i] - 1
+            # generate a random permutation
+            scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i]
+            # ensure no reordering inside a word
+            scores += 1e-6 * np.arange(length_no_eos.item())
+            permutation = scores.argsort()
+            # shuffle words
+            x2[:length_no_eos, i].copy_(
+                x2[:length_no_eos, i][torch.from_numpy(permutation)]
+            )
+        return x2, lengths
+
+
+class UnsupervisedMTNoising(WordNoising):
+    """
+    Implements the default configuration for noising in UnsupervisedMT
+    (github.com/facebookresearch/UnsupervisedMT)
+    """
+
+    def __init__(
+        self,
+        dictionary,
+        max_word_shuffle_distance,
+        word_dropout_prob,
+        word_blanking_prob,
+        bpe_cont_marker="@@",
+        bpe_end_marker=None,
+    ):
+        super().__init__(dictionary)
+        self.max_word_shuffle_distance = max_word_shuffle_distance
+        self.word_dropout_prob = word_dropout_prob
+        self.word_blanking_prob = word_blanking_prob
+
+        # both sub-noisers share the dictionary and the BPE markers
+        bpe_markers = dict(
+            bpe_cont_marker=bpe_cont_marker,
+            bpe_end_marker=bpe_end_marker,
+        )
+        self.word_dropout = WordDropout(dictionary=dictionary, **bpe_markers)
+        self.word_shuffle = WordShuffle(dictionary=dictionary, **bpe_markers)
+
+    def noising(self, x, lengths):
+        """Shuffle, then drop, then blank words; return only the noisy tokens.
+        Blanking reuses ``self.word_dropout`` with the dictionary's unk index.
+        """
+        # 1. Word Shuffle
+        tokens, lens = self.word_shuffle.noising(
+            x=x,
+            lengths=lengths,
+            max_shuffle_distance=self.max_word_shuffle_distance,
+        )
+        # 2. Word Dropout
+        tokens, lens = self.word_dropout.noising(
+            x=tokens,
+            lengths=lens,
+            dropout_prob=self.word_dropout_prob,
+        )
+        # 3. Word Blanking
+        tokens, lens = self.word_dropout.noising(
+            x=tokens,
+            lengths=lens,
+            dropout_prob=self.word_blanking_prob,
+            blank_idx=self.dictionary.unk(),
+        )
+
+        return tokens
+
+
+class NoisingDataset(torch.utils.data.Dataset):
+    def __init__(
+        self,
+        src_dataset,
+        src_dict,
+        seed,
+        noiser=None,
+        noising_class=UnsupervisedMTNoising,
+        **kwargs
+    ):
+        """
+        Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the
+        samples based on the supplied noising configuration.
+
+        Args:
+            src_dataset (~torch.utils.data.Dataset): dataset to wrap.
+                to build self.src_dataset --
+                a LanguagePairDataset with src dataset as the source dataset and
+                None as the target dataset. Should NOT have padding so that
+                src_lengths are accurately calculated by language_pair_dataset
+                collate function.
+                We use language_pair_dataset here to encapsulate the tgt_dataset
+                so we can re-use the LanguagePairDataset collater to format the
+                batches in the structure that SequenceGenerator expects.
+            src_dict (~fairseq.data.Dictionary): source dictionary
+            seed (int): seed to use when generating random noise
+            noiser (WordNoising): a pre-initialized :class:`WordNoising`
+                instance. If this is None, a new instance will be created using
+                *noising_class* and *kwargs*.
+            noising_class (class, optional): class to use to initialize a
+                default :class:`WordNoising` instance.
+            kwargs (dict, optional): arguments to initialize the default
+                :class:`WordNoising` instance when *noiser* is None.
+        """
+        self.src_dataset = src_dataset
+        self.src_dict = src_dict
+        self.seed = seed
+        if noiser is None:
+            # no ready-made noiser was supplied: build one from noising_class,
+            # passing the dictionary plus any extra keyword arguments
+            noiser = noising_class(dictionary=src_dict, **kwargs)
+        self.noiser = noiser
+        self.sizes = src_dataset.sizes
+
+    def __getitem__(self, index):
+        """
+        Returns a single noisy sample. Multiple samples are fed to the collater
+        to create a noising dataset batch.
+        """
+        sample = self.src_dataset[index]
+        lengths = torch.LongTensor([len(sample)])
+
+        # Add a batch dimension, then transpose to the shape the noising
+        # function expects for x:
+        # (sequence length,) -> (1, sequence length) -> (sequence length, 1)
+        column = torch.t(sample.unsqueeze(0))
+
+        # the numpy seed context is keyed on seed + index
+        with data_utils.numpy_seed(self.seed + index):
+            noisy = self.noiser.noising(column, lengths)
+
+        # Transpose back to expected src_tokens format
+        # (sequence length, 1) -> (1, sequence length)
+        return torch.t(noisy)[0]
+
+    def __len__(self):
+        """
+        The length of the noising dataset is the length of src.
+        """
+        return len(self.src_dataset)
+
+    @property
+    def supports_prefetch(self):
+        """Delegates to the wrapped dataset's ``supports_prefetch``."""
+        return self.src_dataset.supports_prefetch
+
+    def prefetch(self, indices):
+        """Forward *indices* to the wrapped dataset if it supports prefetch."""
+        if not self.src_dataset.supports_prefetch:
+            return
+        self.src_dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
new file mode 100644
index 0000000000..99a17495c7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import FairseqDataset
+
+
+class NumSamplesDataset(FairseqDataset):
+    def __getitem__(self, index):
+        return 1  # every sample counts as one
+
+    def __len__(self):
+        return 0  # the reported length is always zero
+
+    def collater(self, samples):
+        return sum(samples)  # adds up the per-sample values of a batch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
new file mode 100644
index 0000000000..ac86dfd2f1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class NumelDataset(BaseWrapperDataset):
+    """Wraps a dataset and reports the element count of each of its items."""
+
+    def __init__(self, dataset, reduce=False):
+        super().__init__(dataset)
+        self.reduce = reduce
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        # tensors are counted with torch, anything else with numpy
+        return torch.numel(item) if torch.is_tensor(item) else np.size(item)
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        # reduce=True gives the batch total; otherwise one count per sample
+        if self.reduce:
+            return sum(samples)
+        return torch.tensor(samples)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
new file mode 100644
index 0000000000..6fabbdcdaa
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class OffsetTokensDataset(BaseWrapperDataset):
+    def __init__(self, dataset, offset):
+        super().__init__(dataset)
+        self.offset = offset  # value added to every item in __getitem__
+
+    def __getitem__(self, idx):
+        return self.dataset[idx] + self.offset  # wrapped item shifted by offset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
new file mode 100644
index 0000000000..b512d370f9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.data import data_utils
+
+from . import BaseWrapperDataset
+
+
+class PadDataset(BaseWrapperDataset):
+    def __init__(self, dataset, pad_idx, left_pad, pad_length=None):
+        super().__init__(dataset)
+        self.pad_idx, self.left_pad, self.pad_length = pad_idx, left_pad, pad_length
+
+    def collater(self, samples):
+        """Collate *samples* with data_utils.collate_tokens using the stored
+        pad index, padding side and optional pad-to length."""
+        return data_utils.collate_tokens(
+            samples, self.pad_idx, left_pad=self.left_pad, pad_to_length=self.pad_length
+        )
+
+
+class LeftPadDataset(PadDataset):  # PadDataset with left_pad fixed to True
+    def __init__(self, dataset, pad_idx):
+        super().__init__(dataset, pad_idx, left_pad=True)
+
+
+class RightPadDataset(PadDataset):  # PadDataset with left_pad fixed to False
+    def __init__(self, dataset, pad_idx):
+        super().__init__(dataset, pad_idx, left_pad=False)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
new file mode 100644
index 0000000000..d7f7b88dbb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from fairseq.data import data_utils
+from . import BaseWrapperDataset
+
+
+class PaddingMaskDataset(BaseWrapperDataset):
+    """Yield an all-False boolean mask shaped like each wrapped item."""
+
+    def __init__(self, dataset, left_pad, pad_length=None):
+        super().__init__(dataset)
+        self.left_pad, self.pad_length = left_pad, pad_length
+
+    def __getitem__(self, index):
+        return torch.zeros_like(self.dataset[index]).bool()
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        # True sits in the pad_idx slot (cf. PadDataset), so padding presumably collates to True
+        options = dict(left_pad=self.left_pad, pad_to_length=self.pad_length)
+        return data_utils.collate_tokens(samples, True, **options)
+
+
+class LeftPaddingMaskDataset(PaddingMaskDataset):
+    def __init__(self, dataset):
+        super().__init__(dataset, left_pad=True)  # pad_length keeps its default (None)
+
+
+class RightPaddingMaskDataset(PaddingMaskDataset):
+    def __init__(self, dataset):
+        super().__init__(dataset, left_pad=False)  # pad_length keeps its default (None)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
new file mode 100644
index 0000000000..459fb8acd7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
@@ -0,0 +1,197 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import hashlib
+import json
+import subprocess
+import tempfile
+from typing import Hashable
+
+try:
+    import pyarrow.plasma as plasma
+
+    PYARROW_AVAILABLE = True
+except ImportError:
+    plasma = None
+    PYARROW_AVAILABLE = False
+
+
+class PlasmaArray:
+    """
+    Wrapper around numpy arrays that automatically moves the data to shared
+    memory upon serialization. This is particularly helpful when passing numpy
+    arrays through multiprocessing, so that data is not unnecessarily
+    duplicated or pickled.
+    """
+
+    def __init__(self, array):
+        super().__init__()
+        self.array = array
+        self.disable = array.nbytes < 134217728  # disable for arrays <128MB
+        self.object_id = None
+        self.path = None
+
+        # variables with underscores shouldn't be pickled
+        self._client = None
+        self._server = None
+        self._server_tmp = None
+        self._plasma = None
+
+    @property
+    def plasma(self):
+        if self._plasma is None and not self.disable:
+            self._plasma = plasma  # module-level pyarrow.plasma, or None when its import failed
+        return self._plasma
+
+    def start_server(self):
+        if self.plasma is None or self._server is not None:
+            return
+        assert self.object_id is None
+        assert self.path is None
+        self._server_tmp = tempfile.NamedTemporaryFile()
+        self.path = self._server_tmp.name  # handed to plasma_store via "-s" and later to connect()
+        self._server = subprocess.Popen(
+            ["plasma_store", "-m", str(int(1.05 * self.array.nbytes)), "-s", self.path]
+        )
+
+    @property
+    def client(self):
+        if self._client is None:
+            assert self.path is not None
+            self._client = self.plasma.connect(self.path, num_retries=200)
+        return self._client
+
+    def __getstate__(self):
+        """Called on pickle save"""
+        if self.plasma is None:
+            return self.__dict__  # plasma disabled or unavailable: pickle everything, array included
+        if self.object_id is None:
+            self.start_server()
+            self.object_id = self.client.put(self.array)
+        state = self.__dict__.copy()
+        del state["array"]  # not pickled; __setstate__ re-reads it from plasma via object_id
+        state["_client"] = None
+        state["_server"] = None
+        state["_server_tmp"] = None
+        state["_plasma"] = None
+        return state
+
+    def __setstate__(self, state):
+        """Called on pickle load"""
+        self.__dict__.update(state)
+        if self.plasma is None:
+            return
+        self.array = self.client.get(self.object_id)
+
+    def __del__(self):
+        if self._server is not None:
+            self._server.kill()
+            self._server = None
+            self._server_tmp.close()
+            self._server_tmp = None
+
+
+DEFAULT_PLASMA_PATH = "/tmp/plasma"
+
+
+class PlasmaView:
+    """Interface to write and read from shared memory. Whereas PlasmaArray writes to plasma on serialization,
+    PlasmaView writes to shared memory on instantiation."""
+
+    def __init__(self, array, split_path: str, hash_data: Hashable, plasma_path=None):
+        """
+        Args:
+            array: numpy array to store. This can be read with ``PlasmaView().array``
+            split_path: the path whence the data was read, used for hashing
+            hash_data: other metadata about the array that can be used to create a unique key.
+                as of writing, the 3 callers in ``TokenBlockDataset`` use::
+
+                    hash_data = ((block_size, document_sep_len, str(break_mode), len(dataset)), 0|1|2)
+
+            plasma_path: path given to ``plasma.connect``; ``DEFAULT_PLASMA_PATH`` when None.
+        """
+        assert PYARROW_AVAILABLE
+        assert split_path is not None
+        if plasma_path is None:
+            plasma_path = DEFAULT_PLASMA_PATH
+
+        self.path = plasma_path
+        self.split_path = split_path
+        self._client = None  # Initialize lazily for pickle. plasma clients should not be deep copied or serialized.
+        self._n = None
+
+        self.object_id = self.get_object_id(self.split_path, hash_data)
+        try:
+            self.client.put(array, object_id=self.object_id)
+        except plasma.PlasmaObjectExists:
+            pass  # an object with this id is already stored; keep using it
+
+    @property
+    def client(self):
+        if self._client is None:
+            self._client = plasma.connect(self.path, num_retries=200)
+        return self._client
+
+    @property
+    def array(self):
+        """Fetch a read only view of an np.array, stored in plasma."""
+        return self.client.get(self.object_id)
+
+    @staticmethod
+    def get_object_id(split_path: str, hash_data: Hashable):
+        """Returns plasma.ObjectID from hashing split_path and the JSON form of hash_data."""
+        hasher = hashlib.blake2b(bytes(split_path, "utf-8"), digest_size=20)
+        harg = json.dumps(hash_data).encode("utf-8")
+        hasher.update(harg)
+        return plasma.ObjectID(hasher.digest())
+
+    def __getstate__(self):
+        """Called on pickle save"""
+        self.disconnect()
+        state = self.__dict__.copy()
+        assert state["_client"] is None
+        assert "object_id" in state
+        return state
+
+    def __setstate__(self, state):
+        """Called on pickle load"""
+        self.__dict__.update(state)
+
+    def __del__(self):
+        self.disconnect()
+
+    def disconnect(self):
+        # getattr: __del__ can run on an instance whose __init__ raised before _client was set
+        if getattr(self, "_client", None) is not None:
+            self._client.disconnect()
+            self._client = None
+
+    def __len__(self):
+        """Save reads by caching len"""
+        if self._n is None:
+            self._n = len(self.array)
+        return self._n
+
+
+GB100 = (1024**3) * 100
+
+
+class PlasmaStore:
+    def __init__(self, path=DEFAULT_PLASMA_PATH, nbytes: int = GB100):
+        # Launch a plasma_store subprocess and keep its handle so __del__ can kill it.
+        self.server = self.start(path, nbytes)
+
+    def __del__(self):
+        self.server.kill()  # NOTE(review): AttributeError here if start() raised inside __init__
+
+    @staticmethod
+    def start(path=DEFAULT_PLASMA_PATH, nbytes: int = GB100) -> subprocess.Popen:
+        if not PYARROW_AVAILABLE:
+            raise ImportError("please run pip install pyarrow to use --use_plasma_view")
+        # best practice is to allocate more space than we need. The limitation seems to be the size of /dev/shm
+        _server = subprocess.Popen(["plasma_store", "-m", str(nbytes), "-s", path])
+        plasma.connect(path, num_retries=200)  # If we can't connect we fail immediately
+        return _server
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
new file mode 100644
index 0000000000..ad74784d2d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
@@ -0,0 +1,28 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class PrependDataset(BaseWrapperDataset):
+    def __init__(self, dataset, prepend_getter, ensure_first_token_is=None):
+        super().__init__(dataset)
+        self.prepend_getter = prepend_getter  # called as prepend_getter(dataset, idx); must return an int
+        self.ensure_first_token = ensure_first_token_is
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        is_tuple = isinstance(item, tuple)
+        src = item[0] if is_tuple else item  # for tuple items only the first element is edited
+
+        assert self.ensure_first_token is None or src[0] == self.ensure_first_token
+        prepend_idx = self.prepend_getter(self.dataset, idx)
+        assert isinstance(prepend_idx, int)
+        src[0] = prepend_idx  # overwrites position 0 in place; nothing is inserted
+        item = tuple((src,) + item[1:]) if is_tuple else src
+        return item
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
new file mode 100644
index 0000000000..fd1331f4c4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import BaseWrapperDataset
+
+
+class PrependTokenDataset(BaseWrapperDataset):
+    """Prepend ``token`` to every item; a ``None`` token leaves items untouched."""
+
+    def __init__(self, dataset, token=None):
+        super().__init__(dataset)
+        self.token = token
+        # one extra position per item once a token is prepended
+        self._sizes = dataset.sizes if token is None else np.array(dataset.sizes) + 1
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        if self.token is None:
+            return item
+        return torch.cat([item.new([self.token]), item])
+
+    @property
+    def sizes(self):
+        return self._sizes
+
+    def num_tokens(self, index):
+        count = self.dataset.num_tokens(index)
+        if self.token is not None:
+            count += 1
+        return count
+
+    def size(self, index):
+        length = self.dataset.size(index)
+        if self.token is not None:
+            length += 1
+        return length
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
new file mode 100644
index 0000000000..d054904f41
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class RawLabelDataset(FairseqDataset):
+    def __init__(self, labels):
+        super().__init__()
+        self.labels = labels  # indexable, sized collection; __getitem__ and __len__ read it directly
+
+    def __getitem__(self, index):
+        return self.labels[index]
+
+    def __len__(self):
+        return len(self.labels)
+
+    def collater(self, samples):
+        return torch.tensor(samples)  # the whole batch of labels becomes one tensor
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
new file mode 100644
index 0000000000..5aac2ba96b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class ReplaceDataset(BaseWrapperDataset):
+    """Rewrite tokens of the wrapped items in place according to ``replace_map``.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to replace tokens in
+        replace_map (Dict[int, int]): token to replace -> replacement token
+        offsets (List[int]): one per object returned by the dataset's ``__getitem__``;
+            a value >= 0 limits replacement to ``src[offset:]``, a negative one to ``src[:offset]``.
+    """
+
+    def __init__(self, dataset, replace_map, offsets):
+        super().__init__(dataset)
+        assert len(replace_map) > 0
+        self.offsets = offsets
+        self.replace_map = replace_map
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        sources = item if isinstance(item, tuple) else [item]
+
+        for offset, src in zip(self.offsets, sources):
+            for old, new in self.replace_map.items():
+                # presumably ``src`` is a torch tensor, so the slice is a view edited in place
+                window = src[offset:] if offset >= 0 else src[:offset]
+                window.masked_fill_(window == old, new)
+
+        # either the original tuple or the single item that was wrapped in a list
+        return item
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
new file mode 100644
index 0000000000..2d77ed79d7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import numpy as np
+
+from fairseq.data import BaseWrapperDataset, plasma_utils
+
+logger = logging.getLogger(__name__)
+
+
+class ResamplingDataset(BaseWrapperDataset):
+    """Randomly samples from a given dataset at each epoch.
+
+    Sampling is done with or without replacement, depending on the "replace"
+    parameter.
+
+    Optionally, the epoch size can be rescaled. This is potentially desirable
+    to increase per-epoch coverage of the base dataset (since sampling with
+    replacement means that many items in the dataset will be left out). In the
+    case of sampling without replacement, size_ratio should be strictly less
+    than 1.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset on which to sample.
+        weights (List[float]): list of probability weights, one per dataset item
+            (default: None, which corresponds to uniform sampling).
+        replace (bool): sampling mode; True for "with replacement", or False
+            for "without replacement" (default: True)
+        size_ratio (float): the ratio to subsample to; must be positive
+            (default: 1.0).
+        batch_by_size (bool): whether or not to batch by sequence length
+            (default: True).
+        seed (int): RNG seed to use (default: 0).
+        epoch (int): starting epoch number (default: 1).
+    """
+
+    def __init__(
+        self,
+        dataset,
+        weights=None,
+        replace=True,
+        size_ratio=1.0,
+        batch_by_size=True,
+        seed=0,
+        epoch=1,
+    ):
+        super().__init__(dataset)
+
+        if weights is None:
+            self.weights = None
+
+        else:
+            assert len(weights) == len(dataset)
+            weights_arr = np.array(weights, dtype=np.float64)
+            weights_arr /= weights_arr.sum()  # normalize so the weights sum to 1
+            self.weights = plasma_utils.PlasmaArray(weights_arr)
+
+        self.replace = replace
+
+        assert size_ratio > 0.0
+        if not self.replace:
+            assert size_ratio < 1.0
+        self.size_ratio = float(size_ratio)
+        self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)  # per-epoch length, rounded up
+
+        self.batch_by_size = batch_by_size
+        self.seed = seed
+
+        self._cur_epoch = None
+        self._cur_indices = None
+
+        self.set_epoch(epoch)  # draws the first set of sampled indices
+
+    def __getitem__(self, index):
+        return self.dataset[self._cur_indices.array[index]]
+
+    def __len__(self):
+        return self.actual_size
+
+    @property
+    def sizes(self):
+        if isinstance(self.dataset.sizes, list):
+            return [s[self._cur_indices.array] for s in self.dataset.sizes]
+        return self.dataset.sizes[self._cur_indices.array]
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(self._cur_indices.array[index])
+
+    def size(self, index):
+        return self.dataset.size(self._cur_indices.array[index])
+
+    def ordered_indices(self):
+        if self.batch_by_size:
+            order = [
+                np.arange(len(self)),
+                self.sizes,
+            ]  # No need to handle `self.shuffle == True`
+            return np.lexsort(order)  # lexsort's last key is primary: sort by size, ties by position
+        else:
+            return np.arange(len(self))
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(self._cur_indices.array[indices])
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return False
+
+    def set_epoch(self, epoch):
+        logger.debug("ResamplingDataset.set_epoch: {}".format(epoch))
+        super().set_epoch(epoch)
+
+        if epoch == self._cur_epoch:
+            return  # indices for this epoch were already drawn
+
+        self._cur_epoch = epoch
+
+        # Generate a weighted sample of indices as a function of the
+        # random seed and the current epoch.
+
+        rng = np.random.RandomState(
+            [
+                42,  # magic number
+                self.seed % (2**32),  # global seed
+                self._cur_epoch,  # epoch index
+            ]
+        )
+        self._cur_indices = plasma_utils.PlasmaArray(
+            rng.choice(
+                len(self.dataset),
+                self.actual_size,
+                replace=self.replace,
+                p=(None if self.weights is None else self.weights.array),
+            )
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
new file mode 100644
index 0000000000..a2915eeb3e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import BaseWrapperDataset
+
+
+class RollDataset(BaseWrapperDataset):
+    def __init__(self, dataset, shifts):
+        super().__init__(dataset)
+        self.shifts = shifts  # forwarded unchanged to torch.roll
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        return torch.roll(item, self.shifts)  # no dims given: torch.roll flattens, rolls, restores shape
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
new file mode 100644
index 0000000000..2cb7447ea9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
@@ -0,0 +1,160 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from collections import OrderedDict
+from typing import Dict, Sequence
+
+import numpy as np
+
+from . import FairseqDataset, LanguagePairDataset
+
+logger = logging.getLogger(__name__)
+
+
+class RoundRobinZipDatasets(FairseqDataset):
+    """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together.
+
+    Shorter datasets are repeated in a round-robin fashion to match the length
+    of the longest one.
+
+    Args:
+        datasets (Dict[str, ~fairseq.data.FairseqDataset]): a dictionary of
+            :class:`~fairseq.data.FairseqDataset` instances.
+        eval_key (str, optional): a key used at evaluation time that causes
+            this instance to pass-through batches from *datasets[eval_key]*.
+    """
+
+    def __init__(self, datasets, eval_key=None):
+        super().__init__()
+        if isinstance(datasets, dict):
+            datasets = OrderedDict(datasets)
+        assert isinstance(datasets, OrderedDict)
+        assert datasets, "Can't make a RoundRobinZipDatasets out of nothing"
+        for dataset in datasets.values():
+            assert isinstance(dataset, FairseqDataset)
+
+        self.datasets = datasets
+        self.eval_key = eval_key
+
+        self.longest_dataset_key = max(datasets, key=lambda k: len(datasets[k]))
+        self.longest_dataset = datasets[self.longest_dataset_key]
+        self._ordered_indices: Dict[str, Sequence[int]] = None  # filled lazily by ordered_indices()
+
+    def _map_index(self, key, index):
+        assert (
+            self._ordered_indices is not None
+        ), "Must call RoundRobinZipDatasets.ordered_indices() first"
+        o = self._ordered_indices[key]
+        return o[index % len(o)]  # modulo wraps around, so shorter datasets repeat
+
+    def __getitem__(self, index):
+        if self.eval_key is None:
+            return OrderedDict(
+                [
+                    (key, dataset[self._map_index(key, index)])
+                    for key, dataset in self.datasets.items()
+                ]
+            )
+        else:
+            # at evaluation time it's useful to pass-through batches from a single key
+            return self.datasets[self.eval_key][self._map_index(self.eval_key, index)]
+
+    def __len__(self):
+        if self._ordered_indices is not None:
+            return len(self._ordered_indices[self.longest_dataset_key])
+        return len(self.longest_dataset)
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch."""
+        if len(samples) == 0:
+            return None
+        if self.eval_key is None:
+            return OrderedDict(
+                [
+                    (key, dataset.collater([sample[key] for sample in samples]))
+                    for key, dataset in self.datasets.items()
+                ]
+            )
+        else:
+            # at evaluation time it's useful to pass-through batches from a single key
+            return self.datasets[self.eval_key].collater(samples)
+
+    def num_tokens(self, index):
+        """Return an example's length (number of tokens), used for batching."""
+        # TODO make it configurable whether to use max() or sum() here
+        return max(
+            dataset.num_tokens(self._map_index(key, index))
+            for key, dataset in self.datasets.items()
+        )
+
+    def size(self, index):
+        """Return a dict mapping each dataset key to that example's size. This value
+        is used when filtering a dataset with ``--max-positions``."""
+        return {
+            key: dataset.size(self._map_index(key, index))
+            for key, dataset in self.datasets.items()
+        }
+
+    def ordered_indices(self):
+        """Ordered indices for batching."""
+        if self._ordered_indices is None:
+            # Call the underlying dataset's ordered_indices() here, so that we
+            # get the same random ordering as we would have from using the
+            # underlying sub-datasets directly.
+            self._ordered_indices = OrderedDict(
+                [
+                    (key, dataset.ordered_indices())
+                    for key, dataset in self.datasets.items()
+                ]
+            )
+        return np.arange(len(self))
+
+    def filter_indices_by_size(self, indices, max_positions=None):
+        """
+        Filter each sub-dataset independently, then update the round robin to work
+        on the filtered sub-datasets.
+        """
+
+        def _deep_until_language_pair(dataset):
+            if isinstance(dataset, LanguagePairDataset):
+                return dataset
+            if hasattr(dataset, "tgt_dataset"):
+                return _deep_until_language_pair(dataset.tgt_dataset)
+            if hasattr(dataset, "dataset"):
+                return _deep_until_language_pair(dataset.dataset)
+            raise Exception(f"Don't know how to unwrap this dataset: {dataset}")
+
+        if not isinstance(max_positions, dict):
+            max_positions = {k: max_positions for k in self.datasets.keys()}
+        ignored_some = False
+        for key, dataset in self.datasets.items():  # reads _ordered_indices, so ordered_indices() must have run
+            dataset = _deep_until_language_pair(dataset)
+            self._ordered_indices[key], ignored = dataset.filter_indices_by_size(
+                self._ordered_indices[key], max_positions[key]
+            )
+            if len(ignored) > 0:
+                ignored_some = True
+                logger.warning(
+                    f"{len(ignored)} samples from {key} have invalid sizes and will be skipped, "
+                    f"max_positions={max_positions[key]}, first few sample ids={ignored[:10]}"
+                )
+        # Since we are modifying in place the _ordered_indices,
+        # it's not possible anymore to return valid ignored indices.
+        # Hopefully the extra debug information print above should be enough to debug.
+        # Ideally we would receive ignore_invalid_inputs so that we could have
+        # a proper error message.
+        return (np.arange(len(self)), [0] if ignored_some else [])  # [0] is a placeholder, not a real index
+
+    @property
+    def supports_prefetch(self):
+        return all(
+            getattr(dataset, "supports_prefetch", False)
+            for dataset in self.datasets.values()
+        )
+
+    def prefetch(self, indices):
+        for key, dataset in self.datasets.items():
+            dataset.prefetch([self._map_index(key, index) for index in indices])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
new file mode 100644
index 0000000000..6ebb5d88fe
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+from fairseq.data import data_utils
+
+from . import BaseWrapperDataset
+
+
+class TruncateDataset(BaseWrapperDataset):
+    """Truncate a sequence by returning the first truncation_length tokens"""
+
+    def __init__(self, dataset, truncation_length):
+        super().__init__(dataset)
+        assert truncation_length is not None
+        self.truncation_length = truncation_length
+        self.dataset = dataset  # NOTE(review): presumably already set by the base __init__ — confirm
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        item_len = item.size(0)  # length along the first dimension
+        if item_len > self.truncation_length:
+            item = item[: self.truncation_length]
+        return item
+
+    @property
+    def sizes(self):
+        return np.minimum(self.dataset.sizes, self.truncation_length)  # element-wise cap on reported sizes
+
+    def __len__(self):
+        return len(self.dataset)
+
+
+class RandomCropDataset(TruncateDataset):
+    """Truncate a sequence by returning a random crop of truncation_length tokens"""
+
+    def __init__(self, dataset, truncation_length, seed=1):
+        super().__init__(dataset, truncation_length)
+        self.seed = seed
+        self.epoch = 0  # replaced by set_epoch(); part of the per-item RNG seed
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return True  # only the crop changes, not item sizes
+
+    def set_epoch(self, epoch, **unused):
+        super().set_epoch(epoch)
+        self.epoch = epoch
+
+    def __getitem__(self, index):
+        with data_utils.numpy_seed(self.seed, self.epoch, index):  # crop is a function of (seed, epoch, index)
+            item = self.dataset[index]
+            item_len = item.size(0)
+            excess = item_len - self.truncation_length
+            if excess > 0:
+                start_idx = np.random.randint(0, excess + 1)  # high is exclusive; +1 allows the last window
+                item = item[start_idx : start_idx + self.truncation_length]
+            return item
+
+
+def maybe_shorten_dataset(
+    dataset,
+    split,
+    shorten_data_split_list,
+    shorten_method,
+    tokens_per_sample,
+    seed,
+):
+    """Wrap ``dataset`` in a shortening dataset when ``split`` is selected for it."""
+    selected_splits = shorten_data_split_list.split(",")
+    applies = split in selected_splits or len(shorten_data_split_list) == 0
+    if applies and shorten_method == "truncate":
+        return TruncateDataset(dataset, tokens_per_sample)
+    if applies and shorten_method == "random_crop":
+        return RandomCropDataset(dataset, tokens_per_sample, seed)
+    return dataset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
new file mode 100644
index 0000000000..b3890e7279
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+
+from . import BaseWrapperDataset
+
+
+class SortDataset(BaseWrapperDataset):
+    """Order the wrapped dataset by one or more key arrays via ``np.lexsort``."""
+
+    def __init__(self, dataset, sort_order):
+        super().__init__(dataset)
+        keys = sort_order if isinstance(sort_order, (list, tuple)) else [sort_order]
+        self.sort_order = keys
+        assert all(len(key) == len(dataset) for key in keys)
+
+    def ordered_indices(self):
+        return np.lexsort(self.sort_order)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
new file mode 100644
index 0000000000..72189bd378
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
@@ -0,0 +1,293 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from . import Dictionary, FairseqDataset, data_utils
+
+
+def collate(
+    samples,
+    pad_idx,
+    eos_idx,
+    vocab,
+    left_pad_source=False,
+    left_pad_target=False,
+    input_feeding=True,
+    pad_to_length=None,
+):
+    """Merge span-masked samples into a mini-batch sorted by source length.
+
+    Args:
+        samples (List[dict]): items holding "id", "source" and, optionally,
+            "target" entries.
+        pad_idx (int): padding index passed to ``data_utils.collate_tokens``.
+        eos_idx (int): unused; ``collate_tokens`` is always given ``eos_idx=None``.
+        vocab: unused by this function.
+        left_pad_source (bool): ``left_pad`` flag used for the source tokens.
+        left_pad_target (bool): ``left_pad`` flag used for the target tokens.
+        input_feeding (bool): asserted to be true; enables ``prev_output_tokens``.
+        pad_to_length (dict, optional): "source"/"target" values forwarded
+            to ``collate_tokens`` as ``pad_to_length``.
+
+    Returns:
+        dict: the mini-batch; ``{}`` when ``samples`` is empty. "target",
+        "target_lengths" and ``prev_output_tokens`` are only populated when
+        the samples carry a target.
+    """
+    assert input_feeding
+    if len(samples) == 0:
+        return {}
+
+    def merge(key, left_pad, move_eos_to_beginning=False):
+        return data_utils.collate_tokens(
+            [s[key] for s in samples],
+            pad_idx,
+            eos_idx=None,  # use eos_idx of each sample instead of vocab.eos()
+            left_pad=left_pad,
+            move_eos_to_beginning=move_eos_to_beginning,
+            pad_to_length=None if pad_to_length is None else pad_to_length[key],
+        )
+
+    # sort by descending source length
+    src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
+    src_lengths, sort_order = src_lengths.sort(descending=True)
+    ids = torch.LongTensor([s["id"] for s in samples]).index_select(0, sort_order)
+    src_tokens = merge("source", left_pad_source).index_select(0, sort_order)
+
+    target = target_lengths = prev_output_tokens = None
+    if samples[0].get("target", None) is not None:
+        target = merge("target", left_pad_target).index_select(0, sort_order)
+        # NOTE(review): this iterates rows of the collated batch, so each entry is
+        # presumably the padded width, not a per-sample length -- confirm intent.
+        target_lengths = torch.LongTensor([len(t) for t in target])
+        ntokens = sum(len(s["target"]) for s in samples)
+        if input_feeding:
+            # shifted version of the targets for feeding the previous output
+            # token(s) into the next decoder step
+            prev_output_tokens = merge(
+                "target", left_pad_target, move_eos_to_beginning=True
+            ).index_select(0, sort_order)
+    else:
+        ntokens = sum(len(s["source"]) for s in samples)
+
+    batch = {
+        "id": ids,
+        "ntokens": ntokens,
+        "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths},
+        "target": target,
+        # stays None for target-less samples (this used to raise a TypeError)
+        "target_lengths": target_lengths,
+        # number of samples in the batch, not the size of the first source
+        "nsentences": len(samples),
+        "sort_order": sort_order,
+    }
+    if prev_output_tokens is not None:
+        batch["net_input"]["prev_output_tokens"] = prev_output_tokens
+
+    return batch
+
+
+class SpanMaskedTokensDataset(FairseqDataset):
+    """
+    A wrapper around TokenBlockDataset for T5 dataset.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to wrap
+        vocab (~fairseq.data.Dictionary): vocabulary
+        noise_density (float): fraction of the tokens to select as noise.
+        mean_noise_span_length (float): mean noise span length.
+        shuffle (bool): shuffle the elements before ordering them by size.
+        seed (int): seed passed to ``data_utils.numpy_seed`` together with the
+          epoch and the item index. Default: ``1``
+    """
+
+    def __init__(
+        self,
+        dataset: torch.utils.data.Dataset,
+        vocab: Dictionary,
+        noise_density: float,
+        mean_noise_span_length: float,
+        shuffle: bool,
+        seed: int = 1,
+    ):
+        self.dataset = dataset
+        self.vocab = vocab
+        self.seed = seed
+        self.noise_density = noise_density
+        self.mean_noise_span_length = mean_noise_span_length
+        self.shuffle = shuffle
+        self.epoch = 0
+
+    @property
+    def can_reuse_epoch_itr_across_epochs(self):
+        return True  # only the noise changes, not item sizes
+
+    def set_epoch(self, epoch, **unused):
+        """Record the epoch; it is part of the seed used in ``__getitem__``."""
+        self.epoch = epoch
+
+    def __getitem__(self, index):
+        """Build the sentinel-masked source and its complementary target."""
+        with data_utils.numpy_seed(self.seed, self.epoch, index):
+            item = self.dataset[index]
+            assert item[-1] == self.vocab.eos()
+
+            noise_mask = self.random_spans_noise_mask(len(item))
+
+            source_sentinel_ids = self.create_sentinel_ids(noise_mask.astype(np.int8))
+            source = self.filter_input_ids(item, source_sentinel_ids)
+
+            target_sentinel_ids = self.create_sentinel_ids(
+                (~noise_mask).astype(np.int8)
+            )
+            target = self.filter_input_ids(item, target_sentinel_ids)
+
+        return {
+            "id": index,
+            "source": torch.from_numpy(source),
+            "target": torch.from_numpy(target),
+        }
+
+    def random_spans_noise_mask(self, length):
+        """
+        This function is a copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .
+        Noise mask consisting of random spans of noise tokens.
+        The number of noise tokens and the number of noise spans and non-noise spans
+        are determined deterministically as follows:
+        num_noise_tokens = round(length * noise_density)
+        num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
+        Spans alternate between non-noise and noise, beginning with non-noise.
+        Subject to the above restrictions, all masks are equally likely.
+        Args:
+            length: an int32 scalar (length of the incoming token sequence)
+        Returns:
+            a boolean numpy array with shape [length]
+        """
+
+        num_noise_tokens = int(np.round(length * self.noise_density))
+        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
+        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
+        num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))
+
+        # avoid degeneracy by ensuring positive number of noise spans
+        num_noise_spans = max(num_noise_spans, 1)
+        num_nonnoise_tokens = length - num_noise_tokens
+
+        # pick the lengths of the noise spans and the non-noise spans
+        def _random_segmentation(num_items, num_segments):
+            """
+            Partition a sequence of items randomly into non-empty segments.
+            Args:
+                num_items: an integer scalar > 0
+                num_segments: an integer scalar in [1, num_items]
+            Returns:
+                an array with shape [num_segments] containing positive integers that add up to num_items
+            """
+            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
+            np.random.shuffle(mask_indices)
+            first_in_segment = np.pad(mask_indices, [[1, 0]])
+            segment_id = np.cumsum(first_in_segment)
+            # count length of subsegments assuming that list is sorted
+            _, segment_length = np.unique(segment_id, return_counts=True)
+            return segment_length
+
+        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
+        nonnoise_span_lengths = _random_segmentation(
+            num_nonnoise_tokens, num_noise_spans
+        )
+
+        interleaved_span_lengths = np.reshape(
+            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1),
+            [num_noise_spans * 2],
+        )
+        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
+        span_start_indicator = np.zeros((length,), dtype=np.int8)
+        span_start_indicator[span_starts] = True
+        span_num = np.cumsum(span_start_indicator)
+        is_noise = np.equal(span_num % 2, 1)
+
+        return is_noise
+
+    def create_sentinel_ids(self, mask_indices):
+        """
+        Sentinel ids creation given the indices that should be masked.
+        The start indices of each mask are replaced by the sentinel ids in increasing
+        order. Consecutive mask indices to be deleted are replaced with `-1`.
+        Positions that are not masked keep the value `0`.
+        """
+        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
+
+        sentinel_ids = np.where(
+            start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices
+        )
+        # making sure all sentinel tokens are unique over the example
+        sentinel_ids = np.where(sentinel_ids != 0, len(self.vocab) - sentinel_ids, 0)
+        sentinel_ids -= mask_indices - start_indices
+        return sentinel_ids
+
+    @staticmethod
+    def filter_input_ids(input_ids, sentinel_ids):
+        """
+        Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
+        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
+        """
+        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
+
+        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
+        # masked tokens coming after sentinel tokens and should be removed
+        return input_ids_full[input_ids_full >= 0]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples, pad_to_length=None):
+        """
+        Merge a list of samples to form a mini-batch.
+        Args:
+            samples (List[dict]): samples to collate
+            pad_to_length (dict, optional): forwarded unchanged to ``collate``
+        Returns:
+            dict: a mini-batch of data
+        """
+        return collate(
+            samples,
+            self.vocab.pad(),
+            self.vocab.eos(),
+            self.vocab,
+            pad_to_length=pad_to_length,
+        )
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return self.dataset.sizes[index]
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return self.dataset.sizes[index]
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            indices = np.random.permutation(len(self))
+        else:
+            indices = np.arange(len(self))
+        # mergesort is stable, so equal sizes keep the (possibly shuffled) order
+        return indices[np.argsort(self.dataset.sizes[indices], kind="mergesort")]
+
+    def prefetch(self, indices):
+        """Prefetch ``indices`` through the wrapped dataset."""
+        # NOTE: this class only stores ``self.dataset``; it never defines the
+        # ``src``/``tgt`` attributes, so prefetching is delegated to the wrapper's data.
+        self.dataset.prefetch(indices)
+
+    @property
+    def supports_prefetch(self):
+        """Whether the wrapped dataset reports prefetch support."""
+        return getattr(self.dataset, "supports_prefetch", False)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
new file mode 100644
index 0000000000..06c4808f0a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
@@ -0,0 +1,307 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+import numpy as np
+import torch
+
+from fairseq.data import FairseqDataset, MonolingualDataset, data_utils
+
+
+class SpeechDLMDataset(FairseqDataset):
+    """The dataset used to train the SpeechDLM model as described in the paper:
+    https://arxiv.org/pdf/2203.16502.pdf
+
+    The input datasets are expected to be a dict over channel names with the values
+    being instances of :class:`~fairseq.data.MonolingualDataset`.
+
+    Each element of SpeechDLMDataset is a dictionary with the following keys:
+        - `id` (int) : index of the item
+        - `source` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary over
+            channels with the values containing the input unit tokens
+        - `target_next` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary
+            over channels with the values containing the next unit tokens (input
+            tokens shifted by 1).
+            Its value is None if 'next' not in self.targets
+        - `target_edge` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : dictionary
+            over channels with the values containing the edge unit tokens (input tokens
+            deduplicated).
+            Its value is None if 'edge' not in self.targets
+        - `target_duration` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) :
+            dictionary over channels with the values being the durations of the edge units.
+            Its value is None if 'duration' not in targets.
+        - `target_edge_indices` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) :
+            dictionary over channels with the values being the indices of the edge units
+            in the source sequence.
+            Its value is None if neither 'edge' nor 'duration' is in targets.
+
+    Args:
+        datasets (Dict[str, ~fairseq.data.MonolingualDataset]): a dictionary of
+            :class:`~fairseq.data.MonolingualDataset` instances.
+        targets (List[str]): list of the target types that the SpeechDLM model
+            should predict.  Can be one of "next", "edge", "duration".
+        max_target_durations (int, optional): upper bound applied to the
+            duration targets; ``None`` or a non-positive value means unbounded.
+        shuffle (bool, optional): shuffle the elements before batching
+            (default: False).
+    """
+
+    def __init__(
+        self, datasets, targets=None, max_target_durations=None, shuffle=False
+    ):
+        super().__init__()
+        if isinstance(datasets, dict):
+            datasets = OrderedDict(datasets)
+        assert isinstance(
+            datasets, OrderedDict
+        ), "datasets is expected to be an instance of Dictionary or OrderedDict"
+        assert datasets, "datasets is None"
+        for dataset in datasets.values():
+            assert isinstance(
+                dataset, MonolingualDataset
+            ), "Each value of datasets is expected to be an instance of MonolingualDataset"
+
+        self.datasets = datasets
+        self.targets = targets
+        if max_target_durations is not None and max_target_durations > 0:
+            self.max_target_durations = max_target_durations
+        else:
+            self.max_target_durations = float("inf")
+        self.sizes = next(iter(datasets.values())).sizes
+        self.vocab = next(iter(datasets.values())).vocab
+        self.length = len(next(iter(datasets.values())))
+        self.shuffle = shuffle
+
+        for channel, dataset in datasets.items():
+            assert (
+                len(dataset) == self.length
+            ), "[{}] length mismatch ({} vs {})".format(
+                channel, len(dataset), self.length
+            )
+            assert (dataset.sizes == self.sizes).all(), "[{}] sizes mismatch".format(
+                channel
+            )
+
+            assert (
+                dataset.vocab.pad() == self.vocab.pad()
+            ), "pad token is expected to be the same"
+            assert (
+                dataset.vocab.eos() == self.vocab.eos()
+            ), "eos token is expected to be the same"
+            assert (
+                dataset.vocab.bos() == self.vocab.bos()
+            ), "bos token is expected to be the same"
+            assert (
+                dataset.vocab.unk() == self.vocab.unk()
+            ), "unk token is expected to be the same"
+
+    def __getitem__(self, index):
+        """Collect the source and the requested targets of every channel."""
+        source = OrderedDict(
+            (key, dataset[index]["source"]) for key, dataset in self.datasets.items()
+        )
+
+        item = {
+            "id": index,
+            "source": source,
+            "target_next": None,
+            "target_edge": None,
+            "target_duration": None,
+            "target_edge_indices": None,
+        }
+
+        if self.targets is not None:
+            for channel in self.datasets:
+                target = self._get_target(index, channel)
+                for t in target:
+                    if item[f"target_{t}"] is None:
+                        item[f"target_{t}"] = OrderedDict()
+                    item[f"target_{t}"][channel] = target[t]
+
+        return item
+
+    def __len__(self):
+        return self.length
+
+    def _get_target(self, index, channel):
+        """Get target in one of ['next', 'edge', 'duration']
+        - 'next' is the future unit
+        - 'edge' is the edge unit
+        - 'duration' is the duration of the edge unit
+        """
+        if self.targets is not None:
+            target = {}
+            pad_idx = self.vocab.pad()
+            max_dur = self.max_target_durations
+            future_target = self.datasets[channel][index]["target"]
+            if "edge" in self.targets or "duration" in self.targets:
+                edge_units, edge_unit_counts = torch.unique_consecutive(
+                    future_target, return_counts=True
+                )
+                padding_end = edge_units[-1] == pad_idx
+                if padding_end:
+                    edge_units = edge_units[:-1]
+                    edge_unit_counts = edge_unit_counts[:-1]
+                edge_indices = torch.cumsum(edge_unit_counts, 0)
+                edge_indices = torch.cat([torch.tensor([0]), edge_indices[:-1]])
+                target["edge_indices"] = edge_indices
+
+            for t in self.targets:
+                if t == "next":
+                    target[t] = future_target
+                elif t == "edge":
+                    target[t] = edge_units
+                elif t == "duration":
+                    # count the remaining duration of the last edge indices in the next sentence
+                    if not padding_end and index < len(self.datasets[channel]) - 1:
+                        i = 0
+                        next_target = self.datasets[channel][index + 1]["target"]
+                        while (
+                            i < len(next_target)  # the run may span the whole sentence
+                            and next_target[i] == edge_units[-1]
+                            and edge_unit_counts[-1] + i < max_dur
+                        ):
+                            i += 1
+                        edge_unit_counts[-1] += i
+
+                    # cut off to the maximal threshold; an unbounded limit needs no
+                    # clamping (and inf cannot be stored in the integer counts)
+                    if max_dur and max_dur != float("inf"):
+                        edge_unit_counts[edge_unit_counts > max_dur] = max_dur
+
+                    target[t] = edge_unit_counts
+                else:
+                    raise Exception("invalid target " + t)
+
+            return target
+
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+
+        Args:
+            samples (List[dict]): samples to collate
+
+        Returns:
+            dict: a mini-batch with the following keys:
+
+                - `id` (LongTensor): example IDs in the original input order
+                - `ntokens` (int): total number of tokens in the batch
+                - `net_input` (dict): the input to the Model, containing keys:
+
+                  - `src_tokens` (OrderedDict[str, LongTensor]): dictionary
+                    over channel with the values being padded 2D Tensor of
+                    samples `source` of shape `(bsz, src_len)`.
+                    Padding will appear on the right.
+                  - `src_lengths` (LongTensor): lengths of source sentences
+                    in the mini-batch
+
+                - `target` (dict): the target of the Model, containing keys:
+
+                  - `next` (OrderedDict[str, LongTensor]): dictionary
+                    over channel with the values being padded 2D Tensor of
+                    batch samples' `target_next` of shape `(bsz, tgt_len)`.
+                    Padding will appear on the right.
+                  - `edge` (OrderedDict[str, LongTensor]): dictionary
+                    over channel with the values being the concatenated
+                    1D Tensor of batch samples' `target_edge` of shape
+                    `(sum of dedup_tgt_len,)`
+                  - `duration` (OrderedDict[str, LongTensor]): dictionary
+                    over channel with the values being the concatenated
+                    1D Tensor of batch samples' `target_duration` of shape
+                    `(sum of dedup_tgt_len,)`
+                  - `edge_indices` (OrderedDict[str, LongTensor]): dictionary
+                    over channel with the values being the concatenated
+                    1D Tensor of batch samples' `target_edge_indices` of
+                    shape `(sum of dedup_tgt_len,)`.
+                    The indices are offset by multiples of the padded
+                    source length such that they are the actual indices
+                    in the flattened `src_tokens` Tensor
+        """
+        if len(samples) == 0:
+            return {}
+
+        pad_idx = self.vocab.pad()
+        eos_idx = self.vocab.eos()
+
+        def merge(key, max_size=None):
+            if samples[0][key] is None:
+                return None
+            res = OrderedDict()
+            for channel in samples[0][key]:
+                if key in ["source", "target_next"]:
+                    # fill batch of shape: (batch_size, max_size)
+                    res[channel] = data_utils.collate_tokens(
+                        [s[key][channel] for s in samples],
+                        pad_idx,
+                        eos_idx,
+                        left_pad=False,
+                    )
+                elif key in ["target_edge", "target_duration"]:
+                    # concatenate the edge units/duration
+                    res[channel] = torch.cat([s[key][channel] for s in samples])
+                elif key == "target_edge_indices":
+                    # increase the edge indices to the indices in the flatten batch
+                    res[channel] = torch.cat(
+                        [s[key][channel] + i * max_size for i, s in enumerate(samples)]
+                    )
+
+            return res
+
+        src_tokens = merge("source")
+        tgt_next = merge("target_next")
+        tgt_edge = merge("target_edge")
+        tgt_duration = merge("target_duration")
+        tgt_edge_indices = merge(
+            "target_edge_indices", max_size=next(iter(src_tokens.values())).size(-1)
+        )
+        return {
+            "id": torch.LongTensor([s["id"] for s in samples]),
+            "nsentences": len(samples),
+            "ntokens": sum(len(item) for s in samples for item in s["source"].values()),
+            "net_input": {
+                "src_tokens": src_tokens,
+                "src_lengths": torch.LongTensor(
+                    [next(iter(s["source"].values())).numel() for s in samples]
+                ),
+            },
+            "target": {
+                "next": tgt_next,
+                "edge": tgt_edge,
+                "duration": tgt_duration,
+                "edge_indices": tgt_edge_indices,
+            },
+        }
+
+    def num_tokens(self, index):
+        """Return the number of tokens in a sample. This value is used to
+        enforce ``--max-tokens`` during batching."""
+        return self.sizes[index]
+
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        return self.sizes[index]
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]
+        else:
+            order = [np.arange(len(self))]
+        order.append(self.sizes)
+        return np.lexsort(order)
+
+    @property
+    def supports_prefetch(self):
+        return all(
+            getattr(dataset, "supports_prefetch", False)
+            for dataset in self.datasets.values()
+        )
+
+    def prefetch(self, indices):
+        for dataset in self.datasets.values():
+            dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
new file mode 100644
index 0000000000..cae39ba4d2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import BaseWrapperDataset
+
+
+class StripTokenDataset(BaseWrapperDataset):
+    """Wrapper stripping ``id_to_strip`` runs from the tail, then the head, of items."""
+    def __init__(self, dataset, id_to_strip):
+        super().__init__(dataset)
+        self.id_to_strip = id_to_strip
+
+    def __getitem__(self, index):
+        item = self.dataset[index]
+        for edge, rest in ((-1, slice(None, -1)), (0, slice(1, None))):
+            while len(item) > 0 and item[edge] == self.id_to_strip:
+                item = item[rest]
+        return item
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
new file mode 100644
index 0000000000..fe5c7e2ac8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import logging
+
+import numpy as np
+from fairseq.data.data_utils import numpy_seed
+
+from . import BaseWrapperDataset
+
+
+logger = logging.getLogger(__name__)
+
+
+class SubsampleDataset(BaseWrapperDataset):
+    """Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to subsample
+        size_ratio (float): the ratio to subsample to. must be between 0 and 1 (exclusive)
+        shuffle (bool, optional): break size ties in ``ordered_indices`` randomly (default: False)
+        seed (int, optional): seed used while drawing the subsample (default: None)
+    """
+
+    def __init__(self, dataset, size_ratio, shuffle=False, seed=None):
+        super().__init__(dataset)
+        assert size_ratio < 1
+        self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
+        with numpy_seed(seed) if seed is not None else contextlib.ExitStack():
+            # an int population is sampled like ``np.arange(n)``; no Python list needed
+            self.indices = np.random.choice(
+                len(self.dataset), self.actual_size, replace=False
+            )
+        self.shuffle = shuffle
+        message = "subsampled dataset from {} to {} (ratio={})"
+        logger.info(message.format(len(self.dataset), self.actual_size, size_ratio))
+
+    def __getitem__(self, index):
+        return self.dataset[self.indices[index]]
+
+    def __len__(self):
+        return self.actual_size
+
+    def collater(self, samples):
+        return self.dataset.collater(samples)
+
+    @property
+    def sizes(self):
+        return self.dataset.sizes[self.indices]
+
+    @property
+    def name(self):
+        return self.dataset.name
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(self.indices[index])
+
+    def size(self, index):
+        return self.dataset.size(self.indices[index])
+
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        if self.shuffle:
+            order = [np.random.permutation(len(self))]
+        else:
+            order = [np.arange(len(self))]
+        order.append(self.sizes)
+        return np.lexsort(order)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(self.indices[indices])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
new file mode 100644
index 0000000000..d699f2ea29
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+
+
+class TextCompressionLevel(Enum):
+    none = 0  # plain encoded bytes
+    low = 1  # zlib
+    high = 2  # unishox2
+
+
+class TextCompressor(object):
+    """Text codec: ``none`` just encodes, ``low`` uses zlib, ``high`` uses unishox2."""
+
+    def __init__(
+        self, level: TextCompressionLevel, max_input_byte_length: int = 2**16
+    ):
+        self.level = level
+        self.max_input_length = max_input_byte_length
+
+    @staticmethod
+    def _load_unishox2():
+        """Import the optional unishox2 dependency or explain how to install it."""
+        try:
+            import unishox2
+        except ImportError as err:
+            raise ImportError(
+                "Please install unishox2 for the text compression feature: "
+                "pip install unishox2-py3"
+            ) from err
+        return unishox2
+
+    def compress(self, text: str) -> bytes:
+        """Return ``text`` as bytes, compressed according to ``self.level``."""
+        if self.level == TextCompressionLevel.low:
+            import zlib
+
+            # level 0 would only store the bytes; use zlib's fastest real level
+            return zlib.compress(text.encode(), level=zlib.Z_BEST_SPEED)
+        if self.level == TextCompressionLevel.high:
+            unishox2 = self._load_unishox2()
+            # unishox2: optimized for short text but slower
+            assert len(text.encode()) <= self.max_input_length
+            return unishox2.compress(text)[0]
+        return text.encode()
+
+    def decompress(self, compressed: bytes) -> str:
+        """Invert :meth:`compress` for data written at the same level."""
+        if self.level == TextCompressionLevel.low:
+            import zlib
+
+            return zlib.decompress(compressed).decode()
+        if self.level == TextCompressionLevel.high:
+            return self._load_unishox2().decompress(compressed, self.max_input_length)
+        return compressed.decode()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
new file mode 100644
index 0000000000..a414e7ef64
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from fairseq.data import FairseqDataset, plasma_utils
+from fairseq.data.indexed_dataset import best_fitting_int_dtype
+from typing import Tuple
+
+
+class TokenBlockDataset(FairseqDataset):
+    """Break a Dataset of tokens into blocks.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to break into blocks
+        sizes (List[int]): sentence lengths (required for 'complete' and 'eos')
+        block_size (int): maximum block size (ignored in 'eos' break mode)
+        pad (int): index used to left-pad past_target (see include_targets)
+        eos (int): index used to left-pad source and past_target
+        break_mode (str, optional): Mode used for breaking tokens. Values can
+            be one of:
+            - 'none': break tokens into equally sized blocks (up to block_size)
+            - 'complete': break tokens into blocks (up to block_size) such that
+                blocks contains complete sentences, although block_size may be
+                exceeded if some sentences exceed block_size
+            - 'complete_doc': similar to 'complete' mode, but do not
+                cross document boundaries
+            - 'eos': each block contains one sentence (block_size is ignored)
+        include_targets (bool, optional): return next tokens as targets
+            (default: False).
+        document_sep_len (int, optional): document separator size (required for
+            'complete_doc' break mode). Typically 1 if the sentences have eos
+            and 0 otherwise.
+    """
+
+    def __init__(
+        self,
+        dataset,
+        sizes,
+        block_size,
+        pad,
+        eos,
+        break_mode=None,
+        include_targets=False,
+        document_sep_len=1,
+        use_plasma_view=False,
+        split_path=None,
+        plasma_path=None,
+    ):
+        super().__init__()
+        self.dataset = dataset
+        self.pad = pad
+        self.eos = eos
+        self.include_targets = include_targets
+
+        assert len(dataset) > 0
+        assert len(dataset) == len(sizes)
+        _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices(
+            sizes, break_mode, document_sep_len, block_size
+        )
+        if use_plasma_view:
+            plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset))
+            self._slice_indices = plasma_utils.PlasmaView(
+                slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path
+            )
+            self._sizes = plasma_utils.PlasmaView(
+                _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path
+            )
+            self._block_to_dataset_index = plasma_utils.PlasmaView(
+                block_to_dataset_index,
+                split_path,
+                (plasma_id, 2),
+                plasma_path=plasma_path,
+            )
+        else:
+            self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
+            self._sizes = plasma_utils.PlasmaArray(_sizes)
+            self._block_to_dataset_index = plasma_utils.PlasmaArray(
+                block_to_dataset_index
+            )
+
+    @staticmethod
+    def _build_slice_indices(
+        sizes, break_mode, document_sep_len, block_size
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """Use token_block_utils_fast to build arrays for indexing into self.dataset"""
+        try:
+            from fairseq.data.token_block_utils_fast import (
+                _get_slice_indices_fast,
+                _get_block_to_dataset_index_fast,
+            )
+        except ImportError as err:
+            raise ImportError(
+                "Please build Cython components with: `pip install --editable .` "
+                "or `python setup.py build_ext --inplace`"
+            ) from err
+
+        if isinstance(sizes, list):
+            sizes = np.array(sizes, dtype=np.int64)
+        else:
+            if torch.is_tensor(sizes):
+                sizes = sizes.numpy()
+            sizes = sizes.astype(np.int64)
+
+        break_mode = break_mode if break_mode is not None else "none"
+        # For "eos" break-mode, block_size is not a required parameter.
+        if break_mode == "eos" and block_size is None:
+            block_size = 0
+
+        slice_indices = _get_slice_indices_fast(
+            sizes, str(break_mode), block_size, document_sep_len
+        )
+        _sizes = slice_indices[:, 1] - slice_indices[:, 0]
+
+        # build index mapping block indices to the underlying dataset indices
+        if break_mode == "eos":
+            # much faster version for eos break mode
+            block_to_dataset_index = np.stack(
+                [
+                    np.arange(len(sizes)),  # starting index in dataset
+                    np.zeros(
+                        len(sizes), dtype=np.int64
+                    ),  # starting offset within starting index
+                    np.arange(len(sizes)),  # ending index in dataset
+                ],
+                1,
+            )
+        else:
+            block_to_dataset_index = _get_block_to_dataset_index_fast(
+                sizes,
+                slice_indices,
+            )
+        num_tokens = slice_indices[-1].max()
+        # blocks can be longer than block_size ('complete*', 'eos'), so size by both
+        size_dtype = np.uint16 if max(block_size, _sizes.max()) < 65535 else np.uint32
+        slice_indices_dtype = best_fitting_int_dtype(num_tokens)
+        slice_indices = slice_indices.astype(slice_indices_dtype)
+        _sizes = _sizes.astype(size_dtype)
+        block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)
+        return _sizes, block_to_dataset_index, slice_indices
+
+    @property
+    def slice_indices(self):
+        return self._slice_indices.array
+
+    @property
+    def sizes(self):
+        return self._sizes.array
+
+    @property
+    def block_to_dataset_index(self):
+        return self._block_to_dataset_index.array
+
+    def attr(self, attr: str, index: int):
+        start_ds_idx, _, _ = self.block_to_dataset_index[index]
+        return self.dataset.attr(attr, start_ds_idx)
+
+    def __getitem__(self, index):
+        """Return the tokens of block ``index``.
+
+        With ``include_targets`` the result is ``(source, item, past_target)``,
+        where source/past_target are item shifted right by one/two positions
+        within the concatenated buffer and left-padded with eos/pad.
+        """
+        start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index]
+
+        buffer = torch.cat(
+            [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
+        )
+        slice_s, slice_e = self.slice_indices[index]
+        length = slice_e - slice_s
+        s, e = start_offset, start_offset + length
+        item = buffer[s:e]
+
+        if self.include_targets:
+            # *target* is the original sentence (=item)
+            # *source* is shifted right by 1 (maybe left-padded with eos)
+            # *past_target* is shifted right by 2 (left-padded as needed)
+            # Prepending [pad, eos] once turns both shifts into plain slices and keeps
+            # every output as long as item, even for blocks shorter than two tokens
+            # that start at offset 0 (buffer[0 : e - 2] would go negative there).
+            padded = torch.cat([item.new([self.pad, self.eos]), buffer])
+            source = padded[s + 1 : e + 1]
+            past_target = padded[s:e]
+
+            return source, item, past_target
+
+        return item
+
+    def __len__(self):
+        return len(self.slice_indices)
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        self.dataset.prefetch(
+            {
+                ds_idx
+                for index in indices
+                for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]]
+                for ds_idx in range(start_ds_idx, end_ds_idx + 1)
+            }
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
new file mode 100644
index 0000000000..08af4f3061
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
@@ -0,0 +1,187 @@
+# cython: language_level=3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from itertools import chain
+from libc.math cimport ceil
+
+cimport cython
+cimport numpy as np
+
+from libc.stdint cimport int32_t, int64_t
+
+DTYPE = np.int64
+ctypedef int64_t DTYPE_t
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size):
+    """Split sum(sizes) tokens into consecutive [start, end) ranges of block_size tokens."""
+    # the final range is clipped to the total, so it may be shorter than block_size
+    cdef DTYPE_t total_size = sizes.sum()
+    cdef DTYPE_t n_blocks = <DTYPE_t> ceil(total_size / <double> block_size)
+    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([n_blocks, 2], dtype=DTYPE)
+    cdef DTYPE_t[:, :] bounds = slice_indices
+    cdef DTYPE_t blk, lo, hi
+    for blk in range(n_blocks):
+        lo = blk * block_size
+        hi = min(lo + block_size, total_size)
+        bounds[blk, 0] = lo
+        bounds[blk, 1] = hi
+    return slice_indices
+
+
+cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):
+    """Turn a list of equal-length DTYPE_t rows into a 2-D array, one row per entry.
+
+    Only fast when there are a huge number of rows and a low number of columns.
+    """
+    cdef np.ndarray[DTYPE_t, ndim=1] flattened = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
+    return flattened.reshape((len(list_of_list), -1))
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len):
+    """Compute the [start, end) token offsets of every block as a 2-column array.
+
+    Raises ValueError when break_mode is not None/none/complete/complete_doc/eos."""
+    cdef DTYPE_t tok_idx = 0
+    cdef DTYPE_t sz_idx = 0
+    cdef DTYPE_t curr_size = 0
+    cdef DTYPE_t[:] sizes_view = sizes
+    cdef np.ndarray[DTYPE_t, ndim=2] slice_indices
+    cdef list slice_indices_list = []
+
+    if break_mode is None or break_mode == 'none':
+        slice_indices = _get_slice_indices_none_mode(sizes, block_size)
+    elif break_mode == 'complete':
+        while sz_idx < len(sizes_view):
+            if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0:
+                curr_size += sizes_view[sz_idx]
+                sz_idx += 1
+            else:
+                slice_indices_list.append((tok_idx, tok_idx + curr_size))
+                tok_idx += curr_size
+                curr_size = 0
+        if curr_size > 0:
+            slice_indices_list.append((tok_idx, tok_idx + curr_size))
+        slice_indices = _fast_convert_to_np_array(slice_indices_list)
+    elif break_mode == 'complete_doc':
+        while sz_idx < len(sizes_view):
+            if (
+                (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0)
+                # an empty sentence indicates end-of-document:
+                and sizes_view[sz_idx] != document_sep_len
+            ):
+                curr_size += sizes_view[sz_idx]
+                sz_idx += 1
+            else:
+                # Only keep non-empty documents.
+                if curr_size > 1:
+                    slice_indices_list.append((tok_idx, tok_idx + curr_size))
+                tok_idx += curr_size
+                curr_size = 0
+                if sizes_view[sz_idx] == document_sep_len:
+                    tok_idx += sizes_view[sz_idx]
+                    sz_idx += 1
+        if curr_size > 1:
+            slice_indices_list.append((tok_idx, tok_idx + curr_size))
+        slice_indices = _fast_convert_to_np_array(slice_indices_list)
+    elif break_mode == 'eos':
+        slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE)
+        cumsum = sizes.cumsum(axis=0)
+        slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1]
+        slice_indices[:, 1] = cumsum
+    else:
+        raise ValueError('Invalid break_mode: ' + break_mode)
+    return slice_indices
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):
+    """Locate every [start, end) token slice inside the underlying dataset.
+
+    Row i of the result is (start item index, offset in that item, end item index).
+    """
+    cdef DTYPE_t row, tok_start, tok_end
+    cdef DTYPE_t first_item, first_offset, last_item
+    cdef DatasetSearcher searcher = DatasetSearcher(sizes)
+    cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
+    cdef DTYPE_t[:, :] out = block_to_dataset_index
+    cdef DTYPE_t[:, :] slices = slice_indices
+    cdef Py_ssize_t n_rows = slice_indices.shape[0]
+
+    for row in range(n_rows):
+        tok_start = slices[row, 0]
+        tok_end = slices[row, 1]
+        searcher.seek(tok_start)
+        first_item = searcher.current_index
+        first_offset = searcher.current_offset
+        if tok_end > tok_start:
+            searcher.seek(tok_end - 1)
+            last_item = searcher.current_index
+        else:
+            last_item = first_item
+        out[row, 0] = first_item    # starting index in dataset
+        out[row, 1] = first_offset  # starting offset within starting index
+        out[row, 2] = last_item     # ending index in dataset
+    return block_to_dataset_index
+
+
+cdef class DatasetSearcher(object):
+    """Cursor over a dataset's item sizes that maps a "flat" position to the
+    index of the item containing it and the offset inside that item."""
+    cdef DTYPE_t current_i       # "flat" position the cursor is at
+    cdef DTYPE_t current_offset  # offset of the cursor within item current_index
+    cdef DTYPE_t current_index   # index of the item the cursor is in
+    cdef DTYPE_t[:] sizes        # per-item sizes, as passed to __init__
+
+    def __init__(self, DTYPE_t[:] sizes):
+        self.sizes = sizes
+        self.reset()
+
+    cdef reset(self):
+        self.current_offset = 0     # offset within current index in underlying dataset
+        self.current_i = 0          # "flat" index
+        self.current_index = 0      # index in underlying dataset
+
+    @cython.boundscheck(False)  # NOTE(review): sizes[current_index] below is unchecked; presumably i never exceeds sum(sizes)
+    @cython.wraparound(False)
+    @cython.nonecheck(False)
+    cdef int step(self, DTYPE_t i):  # one move toward flat position i; returns 1 if it must be called again
+        cdef DTYPE_t to_consume  # distance still to travel to reach i
+        cdef DTYPE_t remaining   # tokens left in the current item from the cursor
+        if i < self.current_i:  # the cursor only moves forward, so restart from 0
+            self.reset()
+        if i > self.current_i:
+            to_consume = i - self.current_i
+            remaining = self.sizes[self.current_index] - self.current_offset
+            if remaining > to_consume:  # target lies inside the current item
+                self.current_offset += to_consume
+                self.current_i += to_consume
+            else:  # consume the rest of this item and move to the start of the next
+                assert remaining >= 0
+                self.current_i += remaining
+                self.current_index += 1
+                self.current_offset = 0
+                return 1
+        return 0
+
+    @cython.boundscheck(False)
+    @cython.wraparound(False)
+    @cython.nonecheck(False)
+    cdef seek(self, DTYPE_t i):  # position the cursor exactly at flat position i
+        cdef int not_done = 1
+        while not_done == 1:  # step() returns 1 each time it crosses into the next item
+            not_done = self.step(i)
+        assert self.current_i == i
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
new file mode 100644
index 0000000000..effa127d50
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+from torch.utils.data.dataloader import default_collate
+
+from fairseq.data import ConcatDataset
+
+logger = logging.getLogger(__name__)
+
+
+class TransformEosConcatLangPairDataset(ConcatDataset):
+    """ConcatDataset that swaps source EOS / target BOS per sub-dataset when collating.
+    Combines TransformEosLangPairDataset with ConcatDataset; all datasets are assumed
+    to share the same src_eos, tgt_bos, left_pad_source and left_pad_target.
+    """
+
+    def __init__(
+        self,
+        datasets,
+        src_eos,
+        tgt_bos,
+        new_src_eos=None,
+        new_tgt_bos=None,
+    ):
+        super().__init__(datasets)
+        if new_src_eos is not None and new_src_eos != []:
+            assert len(new_src_eos) == len(datasets)
+        else:
+            new_src_eos = []
+        if new_tgt_bos is not None and new_tgt_bos != []:
+            assert len(new_tgt_bos) == len(datasets)
+        else:
+            new_tgt_bos = []
+        self.src_eos = src_eos
+        self.tgt_bos = tgt_bos
+        self.new_src_eos = (
+            torch.LongTensor(new_src_eos).cpu() if len(new_src_eos) > 0 else []
+        )
+        self.new_tgt_bos = (
+            torch.LongTensor(new_tgt_bos).cpu() if len(new_tgt_bos) > 0 else []
+        )
+        self.left_pad_source = self.is_left_pad_source(datasets)
+        self.left_pad_target = self.is_left_pad_target(datasets)
+        self.pad_idx = self.src_dict_pad()
+
+    def src_dict_pad(self):
+        if hasattr(self.datasets[0], "src_dict"):
+            return self.datasets[0].src_dict.pad()
+        if hasattr(self.datasets[0], "dataset"):
+            return self.datasets[0].dataset.src_dict.pad()
+        raise NotImplementedError("No src_dict is found")
+
+    def __getitem__(self, idx):
+        """Return ``(dataset_idx, sample)`` so collater knows the sample's dataset."""
+        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
+        return dataset_idx, self.datasets[dataset_idx][sample_idx]
+
+    @staticmethod
+    def _shared_pad_flag(datasets, attr, default):
+        """Return the ``attr`` flag that all ``datasets`` agree on.
+
+        Wrappers are unwrapped through ``.dataset``; a dataset without the flag
+        counts as ``default`` (with a warning). Raises ValueError on a mismatch.
+        """
+        def _lookup(ds):
+            if hasattr(ds, attr):
+                return getattr(ds, attr)
+            if hasattr(ds, "dataset"):
+                return _lookup(ds.dataset)
+            logger.warning(f"{type(ds)} has no {attr}, using default {default}")
+            return default
+
+        flag = _lookup(datasets[0])
+        for ds in datasets:
+            if flag != _lookup(ds):
+                raise ValueError(f"Different {attr} setting detected!")
+        return flag
+
+    def is_left_pad_source(self, datasets):
+        """Return the left_pad_source flag shared by ``datasets`` (default True)."""
+        return self._shared_pad_flag(datasets, "left_pad_source", True)
+
+    def is_left_pad_target(self, datasets):
+        """Return the left_pad_target flag shared by ``datasets`` (default False)."""
+        return self._shared_pad_flag(datasets, "left_pad_target", False)
+
+    def collater(self, samples, **extra_args):
+        """Collate (dataset_idx, sample) pairs and apply per-dataset EOS/BOS."""
+        if len(samples) == 0:
+            return samples
+
+        dataset_ids = [s[0] for s in samples]  # NOTE(review): assumes the collater keeps sample order - verify
+        samples = [s[1] for s in samples]
+
+        if hasattr(self.datasets[0], "collater"):
+            samples = self.datasets[0].collater(samples, **extra_args)
+        else:
+            samples = default_collate(samples, **extra_args)
+
+        if len(self.new_src_eos) > 0:
+            if self.left_pad_source:
+                assert (
+                    samples["net_input"]["src_tokens"][:, -1] != self.src_eos
+                ).sum() == 0
+                samples["net_input"]["src_tokens"][:, -1] = self.new_src_eos[
+                    dataset_ids
+                ]
+            else:
+                eos_idx = samples["net_input"]["src_lengths"] - 1
+                assert (
+                    samples["net_input"]["src_tokens"][
+                        torch.arange(eos_idx.size(0)), eos_idx
+                    ]
+                    != self.src_eos
+                ).sum() == 0
+                samples["net_input"]["src_tokens"].scatter_(
+                    1, eos_idx.view(-1, 1), self.new_src_eos[dataset_ids].view(-1, 1)
+                )
+
+        if len(self.new_tgt_bos) > 0 and "prev_output_tokens" in samples["net_input"]:
+            if self.left_pad_target:
+                # TODO: support different padding direction on target side
+                raise NotImplementedError(
+                    "TransformEosConcatLangPairDataset does not implement --left-pad-target True option"
+                )
+            else:
+                assert (
+                    samples["net_input"]["prev_output_tokens"][:, 0] != self.tgt_bos
+                ).sum() == 0
+                samples["net_input"]["prev_output_tokens"][:, 0] = self.new_tgt_bos[
+                    dataset_ids
+                ]
+
+        return samples
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
new file mode 100644
index 0000000000..fb14ff018e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from . import FairseqDataset
+
+
+class TransformEosDataset(FairseqDataset):
+    """Wrap a :class:`~fairseq.data.FairseqDataset` and append or strip a trailing EOS.
+
+    The samples are only modified inside :func:`collater`.
+
+    Args:
+        dataset (~fairseq.data.FairseqDataset): dataset to wrap
+        eos (int): index of the end-of-sentence symbol
+        append_eos_to_src (bool, optional): append EOS to the end of src
+        remove_eos_from_src (bool, optional): remove EOS from the end of src
+        append_eos_to_tgt (bool, optional): append EOS to the end of tgt
+        remove_eos_from_tgt (bool, optional): remove EOS from the end of tgt
+        has_target (bool, optional): if False, skip target checks and size deltas
+    """
+
+    def __init__(
+        self,
+        dataset,
+        eos,
+        append_eos_to_src=False,
+        remove_eos_from_src=False,
+        append_eos_to_tgt=False,
+        remove_eos_from_tgt=False,
+        has_target=True,
+    ):
+        if not isinstance(dataset, FairseqDataset):
+            raise ValueError("dataset must be an instance of FairseqDataset")
+        if append_eos_to_src and remove_eos_from_src:
+            raise ValueError("cannot combine append_eos_to_src and remove_eos_from_src")
+        if append_eos_to_tgt and remove_eos_from_tgt:
+            raise ValueError("cannot combine append_eos_to_tgt and remove_eos_from_tgt")
+
+        self.dataset = dataset
+        self.eos = torch.LongTensor([eos])
+        self.append_eos_to_src = append_eos_to_src
+        self.remove_eos_from_src = remove_eos_from_src
+        self.append_eos_to_tgt = append_eos_to_tgt
+        self.remove_eos_from_tgt = remove_eos_from_tgt
+        self.has_target = has_target
+
+        # how much the reported sizes change: +1 when appending, -1 when removing
+        self._src_delta = bool(append_eos_to_src) - bool(remove_eos_from_src)
+        self._tgt_delta = bool(append_eos_to_tgt) - bool(remove_eos_from_tgt)
+        self._checked_src = False
+        self._checked_tgt = False
+
+    def _check_src(self, src, expect_eos):
+        """On the first call only, assert that src ends with EOS iff expect_eos."""
+        if not self._checked_src:
+            assert (src[-1] == self.eos[0]) == expect_eos
+            self._checked_src = True
+
+    def _check_tgt(self, tgt, expect_eos):
+        """Same one-time check for tgt; does nothing when has_target is False."""
+        if self.has_target and not self._checked_tgt:
+            assert (tgt[-1] == self.eos[0]) == expect_eos
+            self._checked_tgt = True
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples):
+        """Apply the configured EOS edits to every sample, then collate them."""
+        def transform(item):
+            if self.append_eos_to_src:
+                src = item["source"]
+                self.eos = self.eos.to(device=src.device)
+                self._check_src(src, expect_eos=False)
+                item["source"] = torch.cat([src, self.eos])
+            if self.remove_eos_from_src:
+                src = item["source"]
+                self.eos = self.eos.to(device=src.device)
+                self._check_src(src, expect_eos=True)
+                item["source"] = src[:-1]
+            if self.append_eos_to_tgt:
+                tgt = item["target"]
+                self.eos = self.eos.to(device=tgt.device)
+                self._check_tgt(tgt, expect_eos=False)
+                item["target"] = torch.cat([tgt, self.eos])
+            if self.remove_eos_from_tgt:
+                tgt = item["target"]
+                self.eos = self.eos.to(device=tgt.device)
+                self._check_tgt(tgt, expect_eos=True)
+                item["target"] = tgt[:-1]
+            return item
+
+        return self.dataset.collater([transform(item) for item in samples])
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        if not self.has_target:
+            return self.dataset.size(index)
+        src_len, tgt_len = self.dataset.size(index)
+        return (src_len + self._src_delta, tgt_len + self._tgt_delta)
+
+    def ordered_indices(self):
+        # NOTE: assumes adding or removing eos does not change the ordering
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
new file mode 100644
index 0000000000..d8b2109014
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
@@ -0,0 +1,113 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Optional
+
+import torch
+
+from . import FairseqDataset
+
+
+class TransformEosLangPairDataset(FairseqDataset):
+    """A :class:`~fairseq.data.FairseqDataset` wrapper that replaces the source EOS
+    and/or target BOS symbol on collated samples of a language pair dataset.
+
+    Note that the transformation is applied in :func:`collater`.
+
+    Args:
+        dataset (~fairseq.data.FairseqDataset): dataset that collates sample into
+            LanguagePairDataset schema
+        src_eos (int): original source end-of-sentence symbol index to be replaced
+        new_src_eos (int, optional): new end-of-sentence symbol index to replace source eos symbol
+        tgt_bos (int, optional): original target beginning-of-sentence symbol index to be replaced
+        new_tgt_bos (int, optional): new beginning-of-sentence symbol index to replace at the
+            beginning of 'prev_output_tokens'
+    """
+
+    def __init__(
+        self,
+        dataset: FairseqDataset,
+        src_eos: int,
+        new_src_eos: Optional[int] = None,
+        tgt_bos: Optional[int] = None,
+        new_tgt_bos: Optional[int] = None,
+    ):
+        self.dataset = dataset
+        self.src_eos = src_eos
+        self.new_src_eos = new_src_eos
+        self.tgt_bos = tgt_bos
+        self.new_tgt_bos = new_tgt_bos
+
+    def __getitem__(self, index):
+        return self.dataset[index]
+
+    def __len__(self):
+        return len(self.dataset)
+
+    def collater(self, samples, **extra_args):
+        """Collate with the wrapped dataset, then swap source EOS / target BOS.
+
+        Batches that are empty or lack ``net_input`` are returned untouched."""
+        samples = self.dataset.collater(samples, **extra_args)
+        if len(samples) == 0 or "net_input" not in samples:
+            return samples
+
+        if self.new_src_eos is not None:
+            if self.dataset.left_pad_source:
+                assert (
+                    samples["net_input"]["src_tokens"][:, -1] != self.src_eos
+                ).sum() == 0
+                samples["net_input"]["src_tokens"][:, -1] = self.new_src_eos
+            else:
+                eos_idx = samples["net_input"]["src_lengths"] - 1
+                assert (
+                    samples["net_input"]["src_tokens"][
+                        torch.arange(eos_idx.size(0)), eos_idx
+                    ]
+                    != self.src_eos
+                ).sum() == 0
+                eos_idx = eos_idx.view(-1, 1)  # one EOS column index per row for scatter_
+                samples["net_input"]["src_tokens"].scatter_(
+                    1, eos_idx, self.new_src_eos
+                )
+
+        if (
+            self.new_tgt_bos is not None
+            and "prev_output_tokens" in samples["net_input"]
+        ):
+            if self.dataset.left_pad_target:
+                # TODO: support different padding direction on target side
+                raise NotImplementedError(
+                    "TransformEosLangPairDataset does not implement --left-pad-target True option"
+                )
+            else:
+                assert (
+                    samples["net_input"]["prev_output_tokens"][:, 0] != self.tgt_bos
+                ).sum() == 0
+                samples["net_input"]["prev_output_tokens"][:, 0] = self.new_tgt_bos
+
+        return samples
+
+    def num_tokens(self, index):
+        return self.dataset.num_tokens(index)
+
+    def size(self, index):
+        return self.dataset.size(index)
+
+    @property
+    def sizes(self):
+        # dataset.sizes can be a dynamically computed sizes:
+        return self.dataset.sizes
+
+    def ordered_indices(self):
+        return self.dataset.ordered_indices()
+
+    @property
+    def supports_prefetch(self):
+        return getattr(self.dataset, "supports_prefetch", False)
+
+    def prefetch(self, indices):
+        return self.dataset.prefetch(indices)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
new file mode 100644
index 0000000000..25408d28ec
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .configs import FairseqDataclass
+from .constants import ChoiceEnum
+
+
+__all__ = [
+    "FairseqDataclass",  # imported above from .configs
+    "ChoiceEnum",  # imported above from .constants
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
new file mode 100644
index 0000000000..af957fec64
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
@@ -0,0 +1,1147 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import sys
+from dataclasses import _MISSING_TYPE, dataclass, field
+from typing import Any, List, Optional
+
+import torch
+from omegaconf import II, MISSING
+
+from fairseq.dataclass.constants import (
+    DATASET_IMPL_CHOICES,
+    DDP_BACKEND_CHOICES,
+    DDP_COMM_HOOK_CHOICES,
+    GENERATION_CONSTRAINTS_CHOICES,
+    GENERATION_DECODING_FORMAT_CHOICES,
+    LOG_FORMAT_CHOICES,
+    PIPELINE_CHECKPOINT_CHOICES,
+    PRINT_ALIGNMENT_CHOICES,
+    ZERO_SHARDING_CHOICES,
+)
+
+
+@dataclass
+class FairseqDataclass:
+    """Base class for fairseq config dataclasses; exposes per-field defaults and metadata."""
+
+    _name: Optional[str] = None
+
+    @staticmethod
+    def name():
+        return None
+
+    def _get_all_attributes(self) -> List[str]:
+        return list(self.__dataclass_fields__)  # field names, in declaration order
+
+    def _get_meta(
+        self, attribute_name: str, meta: str, default: Optional[Any] = None
+    ) -> Any:
+        return self.__dataclass_fields__[attribute_name].metadata.get(meta, default)
+
+    def _get_name(self, attribute_name: str) -> str:
+        return self.__dataclass_fields__[attribute_name].name
+
+    def _get_default(self, attribute_name: str) -> Any:
+        """Return the value to report as the default of field ``attribute_name``."""
+        if hasattr(self, attribute_name):
+            current = getattr(self, attribute_name)
+            # A value whose text starts with "${" (the form the II(...) defaults in
+            # this file take) is reported as a plain string.
+            if str(current).startswith("${"):
+                return str(current)
+            declared = self.__dataclass_fields__[attribute_name].default
+            if str(declared).startswith("${"):
+                return str(declared)
+            if current != declared:  # the instance value overrides the declaration
+                return current
+
+        spec = self.__dataclass_fields__[attribute_name]
+        if isinstance(spec.default_factory, _MISSING_TYPE):
+            return spec.default
+        return spec.default_factory()
+
+    def _get_type(self, attribute_name: str) -> Any:
+        return self.__dataclass_fields__[attribute_name].type
+
+    def _get_help(self, attribute_name: str) -> Any:
+        return self._get_meta(attribute_name, "help")
+
+    def _get_argparse_const(self, attribute_name: str) -> Any:
+        return self._get_meta(attribute_name, "argparse_const")
+
+    def _get_argparse_alias(self, attribute_name: str) -> Any:
+        return self._get_meta(attribute_name, "argparse_alias")
+
+    def _get_choices(self, attribute_name: str) -> Any:
+        return self._get_meta(attribute_name, "choices")
+
+    @classmethod
+    def from_namespace(cls, args):
+        """Return ``args`` itself if it is a ``cls``; else a new ``cls`` filled from it."""
+        if isinstance(args, cls):
+            # Already the right type: hand it back untouched.
+            return args
+        config = cls()
+        public_fields = (
+            key for key in config.__dataclass_fields__ if not key.startswith("_")
+        )
+        for key in public_fields:
+            if hasattr(args, key):
+                setattr(config, key, getattr(args, key))
+        return config
+
+
+@dataclass
+class CommonConfig(FairseqDataclass):
+    # Core options shared by every kind of job. Options that serve one particular purpose or task
+    # (e.g. `distributed training`, `optimization`) belong in the dedicated dataclasses instead.
+    no_progress_bar: bool = field(
+        default=False, metadata={"help": "disable progress bar"}
+    )
+    log_interval: int = field(
+        default=100,
+        metadata={
+            "help": "log progress every N batches (when progress bar is disabled)"
+        },
+    )
+    log_format: Optional[LOG_FORMAT_CHOICES] = field(
+        default=None, metadata={"help": "log format to use"}
+    )
+    log_file: Optional[str] = field(
+        default=None, metadata={"help": "log file to copy metrics to."}
+    )
+    aim_repo: Optional[str] = field(
+        default=None,
+        metadata={"help": "path to Aim repository"},
+    )
+    aim_run_hash: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Aim run hash. If skipped, creates or continues run "
+            "based on save_dir"
+        },
+    )
+    tensorboard_logdir: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "path to save logs for tensorboard, should match --logdir "
+            "of running tensorboard (default: no tensorboard logging)"
+        },
+    )
+    wandb_project: Optional[str] = field(
+        default=None,
+        metadata={"help": "Weights and Biases project name to use for logging"},
+    )
+    azureml_logging: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Log scalars to AzureML context"},
+    )
+    seed: int = field(
+        default=1, metadata={"help": "pseudo random number generator seed"}
+    )
+    cpu: bool = field(default=False, metadata={"help": "use CPU instead of CUDA"})
+    tpu: bool = field(default=False, metadata={"help": "use TPU instead of CUDA"})
+    bf16: bool = field(default=False, metadata={"help": "use bfloat16; implies --tpu"})
+    memory_efficient_bf16: bool = field(
+        default=False,
+        metadata={
+            "help": "use a memory-efficient version of BF16 training; implies --bf16"
+        },
+    )
+    fp16: bool = field(default=False, metadata={"help": "use FP16"})
+    memory_efficient_fp16: bool = field(
+        default=False,
+        metadata={
+            "help": "use a memory-efficient version of FP16 training; implies --fp16"
+        },
+    )
+    fp16_no_flatten_grads: bool = field(
+        default=False, metadata={"help": "don't flatten FP16 grads tensor"}
+    )
+    fp16_init_scale: int = field(
+        default=2**7, metadata={"help": "default FP16 loss scale"}  # 2**7 == 128
+    )
+    fp16_scale_window: Optional[int] = field(
+        default=None,
+        metadata={"help": "number of updates before increasing loss scale"},
+    )
+    fp16_scale_tolerance: float = field(
+        default=0.0,
+        metadata={
+            "help": "pct of updates that can overflow before decreasing the loss scale"
+        },
+    )
+    on_cpu_convert_precision: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, the floating point conversion to fp16/bf16 runs on CPU. "
+            "This reduces bus transfer time and GPU memory usage."
+        },
+    )
+    min_loss_scale: float = field(
+        default=1e-4,
+        metadata={
+            "help": "minimum FP16/AMP loss scale, after which training is stopped"
+        },
+    )
+    threshold_loss_scale: Optional[float] = field(
+        default=None, metadata={"help": "threshold FP16 loss scale from below"}
+    )
+    amp: bool = field(default=False, metadata={"help": "use automatic mixed precision"})
+    amp_batch_retries: int = field(
+        default=2,
+        metadata={
+            "help": "number of retries of same batch after reducing loss scale with AMP"
+        },
+    )
+    amp_init_scale: int = field(
+        default=2**7, metadata={"help": "default AMP loss scale"}  # 2**7 == 128
+    )
+    amp_scale_window: Optional[int] = field(
+        default=None,
+        metadata={"help": "number of updates before increasing AMP loss scale"},
+    )
+    user_dir: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "path to a python module containing custom extensions (tasks and/or architectures)"
+        },
+    )
+    empty_cache_freq: int = field(
+        default=0,
+        metadata={"help": "how often to clear the PyTorch CUDA cache (0 to disable)"},
+    )
+    all_gather_list_size: int = field(
+        default=16384,
+        metadata={"help": "number of bytes reserved for gathering stats from workers"},
+    )
+    model_parallel_size: int = field(
+        default=1, metadata={"help": "total number of GPUs to parallelize model over"}
+    )
+    quantization_config_path: Optional[str] = field(
+        default=None, metadata={"help": "path to quantization config file"}
+    )
+    profile: bool = field(
+        default=False, metadata={"help": "enable autograd profiler emit_nvtx"}
+    )
+    reset_logging: bool = field(
+        default=False,
+        metadata={
+            "help": "when using Hydra, reset the logging at the beginning of training"
+        },
+    )
+    suppress_crashes: bool = field(
+        default=False,
+        metadata={
+            "help": "suppress crashes when training with the hydra_train entry point so that the "
+            "main method can return a value (useful for sweeps)"
+        },
+    )
+    use_plasma_view: bool = field(
+        default=False, metadata={"help": "Store indices and sizes in shared memory"}
+    )
+    plasma_path: Optional[str] = field(
+        default="/tmp/plasma",
+        metadata={
+            "help": "path to run plasma_store, defaults to /tmp/plasma. Paths outside /tmp tend to fail."
+        },
+    )
+
+
+@dataclass
+class DistributedTrainingConfig(FairseqDataclass):  # options for distributed (multi-process) runs
+    distributed_world_size: int = field(
+        default=max(1, torch.cuda.device_count()),  # evaluated once, when the class is defined
+        metadata={
+            "help": "total number of GPUs across all nodes (default: all visible GPUs)"
+        },
+    )
+    distributed_num_procs: Optional[int] = field(
+        default=max(1, torch.cuda.device_count()),
+        metadata={
+            "help": "total number of processes to fork (default: all visible GPUs)"
+        },
+    )
+    distributed_rank: Optional[int] = field(
+        default=0, metadata={"help": "rank of the current worker"}
+    )
+    distributed_backend: str = field(
+        default="nccl", metadata={"help": "distributed backend"}
+    )
+    distributed_init_method: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "typically tcp://hostname:port that will be used to "
+            "establish initial connection"
+        },
+    )
+    distributed_port: int = field(
+        default=-1,
+        metadata={
+            "help": "port number (not required if using --distributed-init-method)"
+        },
+    )
+    device_id: int = field(
+        default=int(os.getenv("LOCAL_RANK", 0)),  # getenv yields str when set; keep the field an int
+        metadata={
+            "help": "which GPU to use (by default looks for $LOCAL_RANK, usually configured automatically)",
+            "argparse_alias": "--local_rank",
+        },
+    )
+    distributed_no_spawn: bool = field(
+        default=False,
+        metadata={
+            "help": "do not spawn multiple processes even if multiple GPUs are visible"
+        },
+    )
+    ddp_backend: DDP_BACKEND_CHOICES = field(
+        default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"}
+    )
+    ddp_comm_hook: DDP_COMM_HOOK_CHOICES = field(
+        default="none", metadata={"help": "communication hook"}
+    )
+    bucket_cap_mb: int = field(
+        default=25, metadata={"help": "bucket size for reduction"}
+    )
+    fix_batches_to_gpus: bool = field(
+        default=False,
+        metadata={
+            "help": "don't shuffle batches between GPUs; this reduces overall "
+            "randomness and may affect precision but avoids the cost of re-reading the data"
+        },
+    )
+    find_unused_parameters: bool = field(
+        default=False,
+        metadata={
+            "help": "enable unused parameter detection (not applicable to "
+            "--ddp-backend=legacy_ddp)"
+        },
+    )
+    gradient_as_bucket_view: bool = field(
+        default=False,
+        metadata={
+            "help": "when set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. "
+            "(--gradient-as-bucket-view=gradient_as_bucket_view)"
+        },
+    )
+    fast_stat_sync: bool = field(
+        default=False,
+        metadata={"help": "[deprecated] this is now defined per Criterion"},
+    )
+    heartbeat_timeout: int = field(
+        default=-1,
+        metadata={
+            "help": "kill the job if no progress is made in N seconds; "
+            "set to -1 to disable"
+        },
+    )
+    broadcast_buffers: bool = field(
+        default=False,
+        metadata={
+            "help": "Copy non-trainable parameters between GPUs, such as "
+            "batchnorm population statistics"
+        },
+    )
+    slowmo_momentum: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "SlowMo momentum term; by default use 0.0 for 16 GPUs, "
+            "0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs"
+        },
+    )
+    slowmo_base_algorithm: str = field(
+        default="localsgd",
+        metadata={
+            "help": "Base algorithm. Either 'localsgd' or 'sgp'. Please refer "
+            "to the documentation of 'slowmo_base_algorithm' parameter in "
+            "https://fairscale.readthedocs.io/en/latest/api/experimental/nn/slowmo_ddp.html "
+            "for more details"
+        },
+    )
+    localsgd_frequency: int = field(
+        default=3, metadata={"help": "Local SGD allreduce frequency"}
+    )
+    nprocs_per_node: int = field(
+        default=max(1, torch.cuda.device_count()),
+        metadata={
+            "help": "number of GPUs in each node. An allreduce operation across GPUs in "
+            "a node is very fast. Hence, we do allreduce across GPUs in a node, "
+            "and gossip across different nodes"
+        },
+    )
+    pipeline_model_parallel: bool = field(
+        default=False,
+        metadata={"help": "if set, use pipeline model parallelism across GPUs"},
+    )
+    pipeline_balance: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "partition the model into N_K pieces, where each piece "
+            "contains N_i layers. The sum(args.pipeline_balance) "
+            "should equal the total number of layers in the model"
+        },
+    )
+    pipeline_devices: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "a list of device indices indicating which device to place "
+            "each of the N_K partitions. The length of this list should "
+            "equal the length of the --pipeline-balance argument"
+        },
+    )
+    pipeline_chunks: Optional[int] = field(
+        default=0, metadata={"help": "microbatch count for pipeline model parallelism"}
+    )
+    pipeline_encoder_balance: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "partition the pipeline parallel encoder into N_K pieces, where each piece "
+            "contains N_i layers. The sum(args.pipeline_encoder_balance) "
+            "should equal the total number of encoder layers in the model"
+        },
+    )
+    pipeline_encoder_devices: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "a list of device indices indicating which device to place "
+            "each of the N_K partitions. The length of this list should "
+            "equal the length of the --pipeline-encoder-balance argument"
+        },
+    )
+    pipeline_decoder_balance: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "partition the pipeline parallel decoder into N_K pieces, where each piece "
+            "contains N_i layers. The sum(args.pipeline_decoder_balance) "
+            "should equal the total number of decoder layers in the model"
+        },
+    )
+    pipeline_decoder_devices: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "a list of device indices indicating which device to place "
+            "each of the N_K partitions. The length of this list should "
+            "equal the length of the --pipeline-decoder-balance argument"
+        },
+    )
+    pipeline_checkpoint: PIPELINE_CHECKPOINT_CHOICES = field(
+        default="never",
+        metadata={"help": "checkpointing mode for pipeline model parallelism"},
+    )
+    zero_sharding: ZERO_SHARDING_CHOICES = field(
+        default="none", metadata={"help": "ZeRO sharding"}
+    )
+    fp16: bool = II("common.fp16")  # these three reference the matching `common` options
+    memory_efficient_fp16: bool = II("common.memory_efficient_fp16")
+    tpu: bool = II("common.tpu")
+    # configuration for --ddp-backend=fully_sharded
+    no_reshard_after_forward: bool = field(
+        default=False,
+        metadata={"help": "don't reshard parameters after forward pass"},
+    )
+    fp32_reduce_scatter: bool = field(
+        default=False,
+        metadata={"help": "reduce-scatter grads in FP32"},
+    )
+    cpu_offload: bool = field(
+        default=False, metadata={"help": "offload FP32 params to CPU"}
+    )
+    use_sharded_state: bool = field(
+        default=False,
+        metadata={"help": "use sharded checkpoint files"},
+    )
+    not_fsdp_flatten_parameters: bool = field(
+        default=False,
+        metadata={"help": "not flatten parameter param for fsdp"},
+    )
+
+
+@dataclass
+class DatasetConfig(FairseqDataclass):  # data loading, batching and validation-schedule options
+    num_workers: int = field(
+        default=1, metadata={"help": "how many subprocesses to use for data loading"}
+    )
+    skip_invalid_size_inputs_valid_test: bool = field(
+        default=False,
+        metadata={"help": "ignore too long or too short lines in valid and test set"},
+    )
+    max_tokens: Optional[int] = field(
+        default=None, metadata={"help": "maximum number of tokens in a batch"}
+    )
+    batch_size: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "number of examples in a batch",
+            "argparse_alias": "--max-sentences",
+        },
+    )
+    required_batch_size_multiple: int = field(
+        default=8, metadata={"help": "batch size will be a multiplier of this value"}
+    )
+    required_seq_len_multiple: int = field(
+        default=1,
+        metadata={
+            "help": "maximum sequence length in batch will be a multiplier of this value"
+        },
+    )
+    dataset_impl: Optional[DATASET_IMPL_CHOICES] = field(
+        default=None, metadata={"help": "output dataset implementation"}
+    )
+    data_buffer_size: int = field(
+        default=10, metadata={"help": "Number of batches to preload"}
+    )
+    train_subset: str = field(
+        default="train",
+        metadata={"help": "data subset to use for training (e.g. train, valid, test)"},
+    )
+    valid_subset: str = field(
+        default="valid",
+        metadata={
+            "help": "comma separated list of data subsets to use for validation"
+            " (e.g. train, valid, test)"
+        },
+    )
+    combine_valid_subsets: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "if set, combine the validation data subsets"  # was a copy of --valid-subset's text
+            " (boolean flag; see --valid-subset)",
+            "argparse_alias": "--combine-val",
+        },
+    )
+    ignore_unused_valid_subsets: Optional[bool] = field(
+        default=False,
+        metadata={"help": "do not raise error if valid subsets are ignored"},
+    )
+
+    validate_interval: int = field(
+        default=1, metadata={"help": "validate every N epochs"}
+    )
+    validate_interval_updates: int = field(
+        default=0, metadata={"help": "validate every N updates"}
+    )
+    validate_after_updates: int = field(
+        default=0, metadata={"help": "dont validate until reaching this many updates"}
+    )
+    fixed_validation_seed: Optional[int] = field(
+        default=None, metadata={"help": "specified random seed for validation"}
+    )
+    disable_validation: bool = field(
+        default=False, metadata={"help": "disable validation"}
+    )
+    max_tokens_valid: Optional[int] = field(
+        default=II("dataset.max_tokens"),  # default references the training-side option
+        metadata={
+            "help": "maximum number of tokens in a validation batch"
+            " (defaults to --max-tokens)"
+        },
+    )
+    batch_size_valid: Optional[int] = field(
+        default=II("dataset.batch_size"),  # default references the training-side option
+        metadata={
+            "help": "batch size of the validation batch (defaults to --batch-size)",
+            "argparse_alias": "--max-sentences-valid",
+        },
+    )
+    max_valid_steps: Optional[int] = field(
+        default=None,
+        metadata={"help": "How many batches to evaluate", "argparse_alias": "--nval"},
+    )
+    curriculum: int = field(
+        default=0, metadata={"help": "don't shuffle batches for first N epochs"}
+    )
+    gen_subset: str = field(
+        default="test",
+        metadata={"help": "data subset to generate (train, valid, test)"},
+    )
+    num_shards: int = field(
+        default=1, metadata={"help": "shard generation over N shards"}
+    )
+    shard_id: int = field(
+        default=0, metadata={"help": "id of the shard to generate (id < num_shards)"}
+    )
+    grouped_shuffling: bool = field(
+        default=False,
+        metadata={
+            "help": "shuffle batches in groups of num_shards to enable similar sequence lengths on each GPU worker when batches are sorted by length",
+        },
+    )
+    update_epoch_batch_itr: bool = field(
+        default=II("dataset.grouped_shuffling"),
+        metadata={
+            "help": "if true then prevents the reuse of the epoch batch iterator by setting can_reuse_epoch_itr to false (defaults to --grouped-shuffling)",
+        },
+    )
+    update_ordered_indices_seed: bool = field(
+        default=False,
+        metadata={
+            "help": "if true then increment seed with epoch for getting batch iterators, defaults to False.",
+        },
+    )
+
+@dataclass
+class OptimizationConfig(FairseqDataclass):  # stopping criteria, learning rate and update options
+    max_epoch: int = field(
+        default=0, metadata={"help": "force stop training at specified epoch"}
+    )
+    max_update: int = field(
+        default=0, metadata={"help": "force stop training at specified update"}
+    )
+    stop_time_hours: float = field(
+        default=0,
+        metadata={
+            "help": "force stop training after specified cumulative time (if >0)"
+        },
+    )
+    clip_norm: float = field(
+        default=0.0, metadata={"help": "clip threshold of gradients"}
+    )
+    sentence_avg: bool = field(
+        default=False,
+        metadata={
+            "help": "normalize gradients by the number of sentences in a batch"
+            " (default is to normalize by number of tokens)"
+        },
+    )
+    update_freq: List[int] = field(
+        default_factory=lambda: [1],  # factory: each instance gets its own list
+        metadata={"help": "update parameters every N_i batches, when in epoch i"},
+    )
+    lr: List[float] = field(
+        default_factory=lambda: [0.25],  # factory: each instance gets its own list
+        metadata={
+            "help": "learning rate for the first N epochs; all epochs >N using LR_N"
+            " (note: this may be interpreted differently depending on --lr-scheduler)"
+        },
+    )
+    stop_min_lr: float = field(
+        default=-1.0,
+        metadata={"help": "stop training when the learning rate reaches this minimum"},
+    )
+    use_bmuf: bool = field(
+        default=False,
+        metadata={
+            "help": "specify global optimizer for syncing models on different GPUs/shards"
+        },
+    )
+    skip_remainder_batch: Optional[bool] = field(
+        default=False,  # help text previously stated the opposite of the flag name and default
+        metadata={
+            "help": "if set, skip the last (partial) batch of each epoch in training"
+            " (default is to include it)."
+        },
+    )
+    debug_param_names: bool = False  # plain default: no help text or other metadata attached
+
+
+@dataclass
+class CheckpointConfig(FairseqDataclass):  # where, when and how checkpoints are saved and restored
+    save_dir: str = field(
+        default="checkpoints", metadata={"help": "path to save checkpoints"}
+    )
+    restore_file: str = field(
+        default="checkpoint_last.pt",
+        metadata={
+            "help": "filename from which to load checkpoint "
+            "(default: <save-dir>/checkpoint_last.pt)"
+        },
+    )
+    continue_once: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "continues from this checkpoint, unless a checkpoint indicated in 'restore_file' option is present"
+        },
+    )
+    finetune_from_model: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "finetune from a pretrained model; note that meters and lr scheduler will be reset"
+        },
+    )
+    reset_dataloader: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, does not reload dataloader state from the checkpoint"
+        },
+    )
+    reset_lr_scheduler: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, does not load lr scheduler state from the checkpoint"
+        },
+    )
+    reset_meters: bool = field(
+        default=False,
+        metadata={"help": "if set, does not load meters from the checkpoint"},
+    )
+    reset_optimizer: bool = field(
+        default=False,
+        metadata={"help": "if set, does not load optimizer state from the checkpoint"},
+    )
+    optimizer_overrides: str = field(
+        default="{}",
+        metadata={
+            "help": "a dictionary used to override optimizer args when loading a checkpoint"
+        },
+    )
+    save_interval: int = field(
+        default=1, metadata={"help": "save a checkpoint every N epochs"}
+    )
+    save_interval_updates: int = field(
+        default=0, metadata={"help": "save a checkpoint (and validate) every N updates"}
+    )
+    keep_interval_updates: int = field(
+        default=-1,
+        metadata={
+            "help": "keep the last N checkpoints saved with --save-interval-updates"
+        },
+    )
+    keep_interval_updates_pattern: int = field(
+        default=-1,
+        metadata={
+            "help": "when used with --keep-interval-updates, skips deleting "
+            "any checkpoints with update X where "
+            "X %% keep_interval_updates_pattern == 0"  # NOTE(review): "%%" presumably escapes "%" for argparse help formatting; confirm
+        },
+    )
+    keep_last_epochs: int = field(
+        default=-1, metadata={"help": "keep last N epoch checkpoints"}
+    )
+    keep_best_checkpoints: int = field(
+        default=-1, metadata={"help": "keep best N checkpoints based on scores"}
+    )
+    no_save: bool = field(
+        default=False, metadata={"help": "don't save models or checkpoints"}
+    )
+    no_epoch_checkpoints: bool = field(
+        default=False, metadata={"help": "only store last and best checkpoints"}
+    )
+    no_last_checkpoints: bool = field(
+        default=False, metadata={"help": "don't store last checkpoints"}
+    )
+    no_save_optimizer_state: bool = field(
+        default=False,
+        metadata={"help": "don't save optimizer-state as part of checkpoint"},
+    )
+    best_checkpoint_metric: str = field(
+        default="loss", metadata={"help": 'metric to use for saving "best" checkpoints'}
+    )
+    maximize_best_checkpoint_metric: bool = field(
+        default=False,
+        metadata={
+            "help": 'select the largest metric value for saving "best" checkpoints'
+        },
+    )
+    patience: int = field(
+        default=-1,
+        metadata={
+            "help": (
+                "early stop training if valid performance doesn't "
+                "improve for N consecutive validation runs; note "
+                "that this is influenced by --validate-interval"
+            )
+        },
+    )
+    checkpoint_suffix: str = field(
+        default="", metadata={"help": "suffix to add to the checkpoint file name"}
+    )
+    checkpoint_shard_count: int = field(
+        default=1,
+        metadata={
+            "help": "Number of shards containing the checkpoint - "
+            "if the checkpoint is over 300GB, it is preferable "
+            "to split it into shards to prevent OOM on CPU while loading "
+            "the checkpoint"
+        },
+    )
+    load_checkpoint_on_all_dp_ranks: bool = field(
+        default=False,
+        metadata={
+            "help": "load checkpoints on all data parallel devices "
+            "(default: only load on rank 0 and broadcast to other devices)"
+        },
+    )
+    write_checkpoints_asynchronously: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Write checkpoints asynchronously in a separate "
+                "thread. NOTE: This feature is currently being tested."
+            ),
+            "argparse_alias": "--save-async",
+        },
+    )
+    model_parallel_size: int = II("common.model_parallel_size")  # references the `common` option
+
+
+@dataclass
+class FairseqBMUFConfig(FairseqDataclass):  # bmuf-related options; see the per-field help strings
+    block_lr: float = field(
+        default=1, metadata={"help": "block learning rate for bmuf"}
+    )
+    block_momentum: float = field(
+        default=0.875, metadata={"help": "block momentum for bmuf"}
+    )
+    global_sync_iter: int = field(
+        default=50, metadata={"help": "Iteration for syncing global model"}
+    )
+    warmup_iterations: int = field(
+        default=500, metadata={"help": "warmup iterations for model to broadcast"}
+    )
+    use_nbm: bool = field(
+        default=False,
+        metadata={"help": "Specify whether you want to use classical BM / Nesterov BM"},
+    )
+    average_sync: bool = field(
+        default=False,
+        metadata={
+            "help": "Specify whether you want to average the local momentum after each sync"
+        },
+    )
+    distributed_world_size: int = II("distributed_training.distributed_world_size")  # references the distributed_training option
+
+
+@dataclass
+class GenerationConfig(FairseqDataclass):
+    beam: int = field(
+        default=5,
+        metadata={"help": "beam size"},
+    )
+    beam_mt: int = field(
+        default=0,
+        metadata={"help": "beam size for the first-pass decoder"},
+    )
+    nbest: int = field(
+        default=1,
+        metadata={"help": "number of hypotheses to output"},
+    )
+    max_len_a: float = field(
+        default=0,
+        metadata={
+            "help": "generate sequences of maximum length ax + b, where x is the source length"
+        },
+    )
+    max_len_b: int = field(
+        default=200,
+        metadata={
+            "help": "generate sequences of maximum length ax + b, where x is the source length"
+        },
+    )
+    max_len_a_mt: float = field(
+        default=0,
+        metadata={
+            "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder"
+        },
+    )
+    max_len_b_mt: int = field(
+        default=200,
+        metadata={
+            "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder"
+        },
+    )
+    min_len: int = field(
+        default=1,
+        metadata={"help": "minimum generation length"},
+    )
+    match_source_len: bool = field(
+        default=False,
+        metadata={"help": "generations should match the source length"},
+    )
+    unnormalized: bool = field(
+        default=False,
+        metadata={"help": "compare unnormalized hypothesis scores"},
+    )
+    no_early_stop: bool = field(
+        default=False,
+        metadata={"help": "deprecated"},
+    )
+    no_beamable_mm: bool = field(
+        default=False,
+        metadata={"help": "don't use BeamableMM in attention layers"},
+    )
+    lenpen: float = field(
+        default=1,
+        metadata={
+            "help": "length penalty: <1.0 favors shorter, >1.0 favors longer sentences"
+        },
+    )
+    lenpen_mt: float = field(
+        default=1,
+        metadata={
+            "help": "length penalty for the first-pass decoder: <1.0 favors shorter, >1.0 favors longer sentences"
+        },
+    )
+    unkpen: float = field(
+        default=0,
+        metadata={
+            "help": "unknown word penalty: <0 produces more unks, >0 produces fewer"
+        },
+    )
+    replace_unk: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "perform unknown replacement (optionally with alignment dictionary)",
+            "argparse_const": "@@ ",
+        },
+    )
+    sacrebleu: bool = field(
+        default=False,
+        metadata={"help": "score with sacrebleu"},
+    )
+    score_reference: bool = field(
+        default=False,
+        metadata={"help": "just score the reference translation"},
+    )
+    prefix_size: int = field(
+        default=0,
+        metadata={"help": "initialize generation by target prefix of given length"},
+    )
+    no_repeat_ngram_size: int = field(
+        default=0,
+        metadata={
+            "help": "ngram blocking such that this size ngram cannot be repeated in the generation"
+        },
+    )
+    sampling: bool = field(
+        default=False,
+        metadata={"help": "sample hypotheses instead of using beam search"},
+    )
+    sampling_topk: int = field(
+        default=-1,
+        metadata={"help": "sample from top K likely next words instead of all words"},
+    )
+    sampling_topp: float = field(
+        default=-1.0,
+        metadata={
+            "help": "sample from the smallest set whose cumulative probability mass exceeds p for next words"
+        },
+    )
+    constraints: Optional[GENERATION_CONSTRAINTS_CHOICES] = field(
+        default=None,
+        metadata={
+            "help": "enables lexically constrained decoding",
+            "argparse_const": "ordered",
+        },
+    )
+    temperature: float = field(
+        default=1.0,
+        metadata={"help": "temperature for generation"},
+    )
+    diverse_beam_groups: int = field(
+        default=-1,
+        metadata={"help": "number of groups for Diverse Beam Search"},
+    )
+    diverse_beam_strength: float = field(
+        default=0.5,
+        metadata={"help": "strength of diversity penalty for Diverse Beam Search"},
+    )
+    diversity_rate: float = field(
+        default=-1.0,
+        metadata={"help": "strength of diversity penalty for Diverse Siblings Search"},
+    )
+    print_alignment: Optional[PRINT_ALIGNMENT_CHOICES] = field(
+        default=None,
+        metadata={
+            "help": "if set, uses attention feedback to compute and print alignment to source tokens "
+            "(valid options are: hard, soft, otherwise treated as hard alignment)",
+            "argparse_const": "hard",
+        },
+    )
+    print_step: bool = field(
+        default=False,
+        metadata={"help": "print steps"},
+    )
+    lm_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "path to lm checkpoint for lm fusion"},
+    )
+    lm_weight: float = field(
+        default=0.0,
+        metadata={"help": "weight for lm probs for lm fusion"},
+    )
+
+    # arguments for iterative refinement generator
+    iter_decode_eos_penalty: float = field(
+        default=0.0,
+        metadata={"help": "if > 0.0, it penalized early-stopping in decoding."},
+    )
+    iter_decode_max_iter: int = field(
+        default=10,
+        metadata={"help": "maximum iterations for iterative refinement."},
+    )
+    iter_decode_force_max_iter: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, run exact the maximum number of iterations without early stop"
+        },
+    )
+    iter_decode_with_beam: int = field(
+        default=1,
+        metadata={
+            "help": "if > 1, model will generate translations varying by the lengths."
+        },
+    )
+    iter_decode_with_external_reranker: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, the last checkpoint are assumed to be a reranker to rescore the translations"
+        },
+    )
+    retain_iter_history: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, decoding returns the whole history of iterative refinement"
+        },
+    )
+    retain_dropout: bool = field(
+        default=False,
+        metadata={"help": "Use dropout at inference time"},
+    )
+    # temporarily set to Any until https://github.com/facebookresearch/hydra/issues/1117 is fixed
+    # retain_dropout_modules: Optional[List[str]] = field(
+    retain_dropout_modules: Any = field(
+        default=None,
+        metadata={
+            "help": "if set, only retain dropout for the specified modules; "
+            "if not set, then dropout will be retained for all modules"
+        },
+    )
+    # special decoding format for advanced decoding.
+    decoding_format: Optional[GENERATION_DECODING_FORMAT_CHOICES] = field(
+        default=None,
+        metadata={"help": "special decoding format for advanced decoding."},
+    )
+    no_seed_provided: bool = field(
+        default=False,
+        metadata={"help": "if set, dont use seed for initializing random generators"},
+    )
+    eos_token: Optional[str] = field(
+        default=None,
+        metadata={"help": "EOS token"},
+    )
+
+
+@dataclass
+class CommonEvalConfig(FairseqDataclass):  # evaluation options: checkpoint path(s), text post-processing, verbosity, model-arg overrides, results path
+    path: Optional[str] = field(
+        default=None,
+        metadata={"help": "path(s) to model file(s), colon separated"},
+    )
+    post_process: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "post-process text by removing BPE, letter segmentation, etc. "
+                "Valid options can be found in fairseq.data.utils.post_process."
+            ),
+            "argparse_const": "subword_nmt",  # becomes argparse `const` (with nargs="?") in gen_parser_from_dataclass
+            "argparse_alias": "--remove-bpe",  # registered as an additional option string for this field
+        },
+    )
+    quiet: bool = field(default=False, metadata={"help": "only print final scores"})
+    model_overrides: str = field(
+        default="{}",  # a string, not a dict; NOTE(review): presumably parsed by the consumer — confirm
+        metadata={
+            "help": "a dictionary used to override model args at generation that were used during model training"
+        },
+    )
+    results_path: Optional[str] = field(
+        default=None, metadata={"help": "path to save eval results (optional)"}
+    )
+
+
+@dataclass
+class EvalLMConfig(FairseqDataclass):  # language-model evaluation options: per-word output, context window, softmax batching
+    output_word_probs: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, outputs words and their predicted log probabilities to standard output"
+        },
+    )
+    output_word_stats: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, outputs word statistics such as word count, average probability, etc"
+        },
+    )
+    context_window: int = field(
+        default=0,
+        metadata={
+            "help": "ensures that every evaluated token has access to a context of at least this size, if possible"
+        },
+    )
+    softmax_batch: int = field(
+        default=sys.maxsize,  # largest threshold possible, so by default the "more than this" condition in the help text is not met
+        metadata={
+            "help": "if BxT is more than this, will batch the softmax over vocab to this amount of tokens, in order to fit into GPU memory"
+        },
+    )
+
+
+@dataclass
+class InteractiveConfig(FairseqDataclass):  # interactive input options: sentence buffering and input source
+    buffer_size: int = field(
+        default=0,
+        metadata={
+            "help": "read this many sentences into a buffer before processing them"
+        },
+    )
+    input: str = field(  # field name shadows the builtin only inside this class namespace
+        default="-",
+        metadata={"help": "file to read from; use - for stdin"},
+    )
+
+
+@dataclass
+class EMAConfig(FairseqDataclass):  # exponential-moving-average options: enable flag, decay, start step, seed checkpoint, update frequency, fp32 storage
+    store_ema: bool = field(
+        default=False, metadata={"help": "store exponential moving average shadow model"}
+    )
+    ema_decay: float = field(
+        default=0.9999, metadata={"help": "decay for exponential moving average model"}
+    )
+    ema_start_update: int = field(
+        default=0, metadata={"help": "start EMA update after this many model updates"}
+    )
+    ema_seed_model: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Seed to load EMA model from. "
+            "Used to load EMA model separately from the actual model."
+        },
+    )
+    ema_update_freq: int = field(
+        default=1, metadata={"help": "Do EMA update every this many model updates"}
+    )
+    ema_fp32: bool = field(
+        default=False,
+        metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"},
+    )
+
+
+@dataclass
+class FairseqConfig(FairseqDataclass):  # root config: one attribute per config group; hydra_init() stores each field default under the field name
+    common: CommonConfig = CommonConfig()  # NOTE(review): dataclass instances as defaults; Python >= 3.11 rejects unhashable defaults — confirm target interpreter
+    common_eval: CommonEvalConfig = CommonEvalConfig()
+    distributed_training: DistributedTrainingConfig = DistributedTrainingConfig()
+    dataset: DatasetConfig = DatasetConfig()
+    optimization: OptimizationConfig = OptimizationConfig()
+    checkpoint: CheckpointConfig = CheckpointConfig()
+    bmuf: FairseqBMUFConfig = FairseqBMUFConfig()
+    generation: GenerationConfig = GenerationConfig()
+    eval_lm: EvalLMConfig = EvalLMConfig()
+    interactive: InteractiveConfig = InteractiveConfig()
+    model: Any = MISSING  # the only group without a usable default
+    task: Any = None  # Any-typed groups are the ones add_defaults() merges with registry dataclasses
+    criterion: Any = None
+    optimizer: Any = None
+    lr_scheduler: Any = None
+    scoring: Any = None
+    bpe: Any = None
+    tokenizer: Any = None
+    ema: EMAConfig = EMAConfig()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
new file mode 100644
index 0000000000..5af92f2b3a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum, EnumMeta
+from typing import List
+
+
+class StrEnumMeta(EnumMeta):
+    # Workaround for https://github.com/facebookresearch/hydra/issues/1156: submitit pickling makes
+    # hydra's instance checks on StrEnum fail, so any object whose type repr mentions "enum" is accepted.
+    @classmethod
+    def __instancecheck__(cls, other):
+        return str(type(other)).count("enum") > 0
+
+
+class StrEnum(Enum, metaclass=StrEnumMeta):  # Enum whose members print, compare and hash through their value
+    def __str__(self):  # str(member) is the raw value
+        return self.value
+
+    def __eq__(self, other: str):  # compares the value to *other*, so a member equals its plain string
+        return self.value == other
+
+    def __repr__(self):  # repr matches str: the raw value, without the usual <Class.NAME: value> form
+        return self.value
+
+    def __hash__(self):  # defined explicitly because overriding __eq__ would otherwise make members unhashable
+        return hash(str(self))
+
+
+def ChoiceEnum(choices: List[str]):
+    """Build a StrEnum class named "Choices" with one member per choice, each member's value being its own name."""
+    return StrEnum("Choices", {choice: choice for choice in choices})
+
+
+LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"])  # every *_CHOICES below is a StrEnum class built by ChoiceEnum
+DDP_BACKEND_CHOICES = ChoiceEnum(
+    [
+        "c10d",  # alias for pytorch_ddp
+        "fully_sharded",  # FullyShardedDataParallel from fairscale
+        "legacy_ddp",
+        "no_c10d",  # alias for legacy_ddp
+        "pytorch_ddp",
+        "slowmo",
+    ]
+)
+DDP_COMM_HOOK_CHOICES = ChoiceEnum(["none", "fp16"])
+DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta", "huffman"])
+GENERATION_CONSTRAINTS_CHOICES = ChoiceEnum(["ordered", "unordered"])  # used by GenerationConfig.constraints
+GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum(
+    ["unigram", "ensemble", "vote", "dp", "bs"]
+)
+ZERO_SHARDING_CHOICES = ChoiceEnum(["none", "os"])
+PIPELINE_CHECKPOINT_CHOICES = ChoiceEnum(["always", "never", "except_last"])
+PRINT_ALIGNMENT_CHOICES = ChoiceEnum(["hard", "soft"])  # used by GenerationConfig.print_alignment
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
new file mode 100644
index 0000000000..5a7784bad1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import logging
+from hydra.core.config_store import ConfigStore
+from fairseq.dataclass.configs import FairseqConfig
+from omegaconf import DictConfig, OmegaConf
+
+
+logger = logging.getLogger(__name__)
+
+
+def hydra_init(cfg_name="config") -> None:
+    """Store FairseqConfig in hydra's ConfigStore as *cfg_name*, then each field's default under the field name."""
+    store = ConfigStore.instance()
+    store.store(name=f"{cfg_name}", node=FairseqConfig)
+
+    for group, dc_field in FairseqConfig.__dataclass_fields__.items():
+        default = dc_field.default
+        try:
+            store.store(name=group, node=default)
+        except BaseException:
+            logger.error(f"{group} - {default}")
+            raise
+
+
+def add_defaults(cfg: DictConfig) -> None:
+    """Merge registry dataclass defaults into the Any-typed groups of *cfg* (task, model, REGISTRIES entries) that hydra doesn't know about."""
+
+    from fairseq.registry import REGISTRIES
+    from fairseq.tasks import TASK_DATACLASS_REGISTRY
+    from fairseq.models import ARCH_MODEL_NAME_REGISTRY, MODEL_DATACLASS_REGISTRY
+    from fairseq.dataclass.utils import merge_with_parent
+    from typing import Any
+
+    OmegaConf.set_struct(cfg, False)  # struct mode is switched off here and not switched back on for cfg itself
+
+    for k, v in FairseqConfig.__dataclass_fields__.items():
+        field_cfg = cfg.get(k)
+        if field_cfg is not None and v.type == Any:  # only groups that are set and declared as Any
+            dc = None
+
+            if isinstance(field_cfg, str):  # a bare name is promoted to a config holding just `_name`
+                field_cfg = DictConfig({"_name": field_cfg})
+                field_cfg.__dict__["_parent"] = field_cfg.__dict__["_parent"]  # NOTE(review): self-assignment, a no-op as written — intended parent unclear, confirm
+
+            name = getattr(field_cfg, "_name", None)
+
+            if k == "task":
+                dc = TASK_DATACLASS_REGISTRY.get(name)
+            elif k == "model":
+                name = ARCH_MODEL_NAME_REGISTRY.get(name, name)  # map an architecture name to its model name when known
+                dc = MODEL_DATACLASS_REGISTRY.get(name)
+            elif k in REGISTRIES:
+                dc = REGISTRIES[k]["dataclass_registry"].get(name)
+
+            if dc is not None:  # groups without a registered dataclass are left untouched
+                cfg[k] = merge_with_parent(dc, field_cfg)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
new file mode 100644
index 0000000000..f6467d5f40
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
@@ -0,0 +1,510 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import ast
+import inspect
+import logging
+import os
+import re
+from argparse import ArgumentError, ArgumentParser, Namespace
+from dataclasses import _MISSING_TYPE, MISSING, is_dataclass
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.configs import FairseqConfig
+from hydra.core.global_hydra import GlobalHydra
+from hydra.experimental import compose, initialize
+from omegaconf import DictConfig, OmegaConf, open_dict, _utils
+
+logger = logging.getLogger(__name__)
+
+
+def eval_str_list(x, x_type=float):
+    """Convert *x* (None, a Python-literal string, an iterable or a scalar) to a list of ``x_type``; None stays None."""
+    if x is None:
+        return None
+    if isinstance(x, str):
+        # an empty string means an empty list; any other string must be a Python literal
+        x = ast.literal_eval(x) if x else []
+    try:
+        return [x_type(item) for item in x]
+    except TypeError:
+        return [x_type(x)]
+
+
+def interpret_dc_type(field_type):
+    """Reduce a dataclass annotation to the type used for parsing: Any -> str, Optional[T] -> T, anything else unchanged."""
+    if isinstance(field_type, str):
+        raise RuntimeError("field should be a type")
+
+    if field_type == Any:
+        return str
+
+    type_repr = str(field_type)
+    is_optional = type_repr.startswith("typing.Optional") or re.match(
+        r"(typing.|^)Union\[(.*), NoneType\]$", type_repr
+    )
+    return field_type.__args__[0] if is_optional else field_type
+
+
+def gen_parser_from_dataclass(
+    parser: ArgumentParser,
+    dataclass_instance: FairseqDataclass,
+    delete_default: bool = False,
+    with_prefix: Optional[str] = None,
+) -> None:
+    """
+    Add one argparse argument to *parser* for every attribute of *dataclass_instance*.
+
+    Attributes typed as FairseqDataclass subclasses are expanded recursively. If `with_prefix` is provided, all
+    keys are prefixed with it, building a flat namespace from a structured dataclass (see transformer_config.py).
+    """
+
+    def argparse_name(name: str):
+        if name == "data" and (with_prefix is None or with_prefix == ""):
+            # normally data is positional args, so we don't add the -- nor the prefix
+            return name
+        if name == "_name":
+            # private member, skip
+            return None
+        full_name = "--" + name.replace("_", "-")
+        if with_prefix is not None and with_prefix != "":
+            # if a prefix is specified, construct the prefixed arg name
+            full_name = with_prefix + "-" + full_name[2:]  # strip -- when composing
+        return full_name
+
+    def get_kwargs_from_dc(
+        dataclass_instance: FairseqDataclass, k: str
+    ) -> Dict[str, Any]:
+        """Build the keyword arguments for ``parser.add_argument`` from dataclass attribute *k*."""
+
+        kwargs = {}
+
+        field_type = dataclass_instance._get_type(k)
+        inter_type = interpret_dc_type(field_type)
+
+        field_default = dataclass_instance._get_default(k)
+
+        if isinstance(inter_type, type) and issubclass(inter_type, Enum):
+            field_choices = [t.value for t in list(inter_type)]
+        else:
+            field_choices = None
+
+        field_help = dataclass_instance._get_help(k)
+        field_const = dataclass_instance._get_argparse_const(k)
+
+        if isinstance(field_default, str) and field_default.startswith("${"):
+            kwargs["default"] = field_default  # interpolation string: passed through untouched, no type/choices
+        else:
+            if field_default is MISSING:
+                kwargs["required"] = True
+            if field_choices is not None:
+                kwargs["choices"] = field_choices
+            if (
+                isinstance(inter_type, type)
+                and (issubclass(inter_type, List) or issubclass(inter_type, Tuple))
+            ) or ("List" in str(inter_type) or "Tuple" in str(inter_type)):
+                if "int" in str(inter_type):  # element type is picked by substring match on the annotation text
+                    kwargs["type"] = lambda x: eval_str_list(x, int)
+                elif "float" in str(inter_type):
+                    kwargs["type"] = lambda x: eval_str_list(x, float)
+                elif "str" in str(inter_type):
+                    kwargs["type"] = lambda x: eval_str_list(x, str)
+                else:
+                    raise NotImplementedError(
+                        "parsing of type " + str(inter_type) + " is not implemented"
+                    )
+                if field_default is not MISSING:
+                    kwargs["default"] = (
+                        ",".join(map(str, field_default))
+                        if field_default is not None
+                        else None
+                    )
+            elif (
+                isinstance(inter_type, type) and issubclass(inter_type, Enum)
+            ) or "Enum" in str(inter_type):
+                kwargs["type"] = str
+                if field_default is not MISSING:
+                    if isinstance(field_default, Enum):
+                        kwargs["default"] = field_default.value
+                    else:
+                        kwargs["default"] = field_default
+            elif inter_type is bool:
+                kwargs["action"] = (
+                    "store_false" if field_default is True else "store_true"  # passing the flag flips the default
+                )
+                kwargs["default"] = field_default
+            else:
+                kwargs["type"] = inter_type
+                if field_default is not MISSING:
+                    kwargs["default"] = field_default
+
+        # build the help with the hierarchical prefix
+        if with_prefix is not None and with_prefix != "" and field_help is not None:
+            field_help = with_prefix[2:] + ": " + field_help  # assumes the prefix starts with "--"
+
+        kwargs["help"] = field_help
+        if field_const is not None:
+            kwargs["const"] = field_const
+            kwargs["nargs"] = "?"  # option may be given without a value, in which case `const` is used
+
+        return kwargs
+
+    for k in dataclass_instance._get_all_attributes():
+        field_name = argparse_name(dataclass_instance._get_name(k))
+        field_type = dataclass_instance._get_type(k)
+        if field_name is None:
+            continue
+        elif inspect.isclass(field_type) and issubclass(field_type, FairseqDataclass):
+            # for fields that are of type FairseqDataclass, we can recursively
+            # add their fields to the namespace (so we add the args from model, task, etc. to the root namespace)
+            prefix = None
+            if with_prefix is not None:
+                # if a prefix is specified, then we don't want to copy the subfields directly to the root namespace
+                # but we prefix them with the name of the current field.
+                prefix = field_name
+            gen_parser_from_dataclass(parser, field_type(), delete_default, prefix)
+            continue
+
+        kwargs = get_kwargs_from_dc(dataclass_instance, k)
+
+        field_args = [field_name]
+        alias = dataclass_instance._get_argparse_alias(k)
+        if alias is not None:
+            field_args.append(alias)
+
+        if "default" in kwargs:
+            if isinstance(kwargs["default"], str) and kwargs["default"].startswith(
+                "${"
+            ):
+                if kwargs["help"] is None:
+                    # this is a field with a name that will be added elsewhere
+                    continue
+                else:
+                    del kwargs["default"]  # an interpolation string is never used as an argparse default
+            if delete_default and "default" in kwargs:
+                del kwargs["default"]
+        try:
+            parser.add_argument(*field_args, **kwargs)
+        except ArgumentError:
+            pass  # e.g. option string already registered on this parser; the existing definition is kept
+
+
+def _set_legacy_defaults(args, cls):
+    """Fill attributes missing from *args* with the defaults that ``cls.add_args`` declares; no-op without add_args."""
+    if not hasattr(cls, "add_args"):
+        return
+
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        argument_default=argparse.SUPPRESS, allow_abbrev=False
+    )
+    cls.add_args(parser)
+    # same collection rule as argparse.py: first action wins per dest, SUPPRESS dests/defaults are skipped
+    defaults = argparse.Namespace()
+    for action in parser._actions:
+        if action.dest is argparse.SUPPRESS or action.default is argparse.SUPPRESS:
+            continue
+        if not hasattr(defaults, action.dest):
+            setattr(defaults, action.dest, action.default)
+    for key, default_value in vars(defaults).items():
+        if not hasattr(args, key):
+            setattr(args, key, default_value)
+
+
+def _override_attr(
+    sub_node: str, data_class: Type[FairseqDataclass], args: Namespace
+) -> List[str]:
+    """Return hydra override strings ("<sub_node>.<field>=<value>") for the public fields of *data_class*, taking values from *args* or the field default."""
+    overrides = []
+    if not inspect.isclass(data_class) or not issubclass(data_class, FairseqDataclass):
+        return overrides
+
+    def get_default(f):
+        if not isinstance(f.default_factory, _MISSING_TYPE):
+            return f.default_factory()
+        return f.default
+
+    for k, v in data_class.__dataclass_fields__.items():
+        if k.startswith("_"):
+            # private member, skip
+            continue
+
+        val = get_default(v) if not hasattr(args, k) else getattr(args, k)
+
+        field_type = interpret_dc_type(v.type)
+        if (
+            isinstance(val, str)
+            and not val.startswith("${")  # not interpolation
+            and field_type != str
+            and (
+                not inspect.isclass(field_type) or not issubclass(field_type, Enum)
+            )  # not choices enum
+        ):
+            # upgrade old models that stored complex parameters as string
+            val = ast.literal_eval(val)
+
+        if isinstance(val, tuple):
+            val = list(val)
+
+        v_type = getattr(v.type, "__origin__", None)
+        if (
+            (v_type is List or v_type is list or v_type is Optional)
+            # skip interpolation
+            and not (isinstance(val, str) and val.startswith("${"))
+        ):
+            # if type is int but val is float, then we will crash later - try to convert here
+            if hasattr(v.type, "__args__"):
+                t_args = v.type.__args__
+                if len(t_args) == 1 and (t_args[0] is float or t_args[0] is int):
+                    val = list(map(t_args[0], val))
+        elif val is not None and (
+            field_type is int or field_type is bool or field_type is float
+        ):
+            try:
+                val = field_type(val)
+            except Exception:  # narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate
+                pass  # ignore errors here, they are often from interpolation args
+
+        if val is None:
+            overrides.append("{}.{}=null".format(sub_node, k))
+        elif val == "":
+            overrides.append("{}.{}=''".format(sub_node, k))
+        elif isinstance(val, str):
+            val = val.replace("'", r"\'")  # escape quotes because the value is emitted inside single quotes
+            overrides.append("{}.{}='{}'".format(sub_node, k, val))
+        elif isinstance(val, FairseqDataclass):
+            overrides += _override_attr(f"{sub_node}.{k}", type(val), args)
+        elif isinstance(val, Namespace):
+            sub_overrides, _ = override_module_args(val)
+            for so in sub_overrides:
+                overrides.append(f"{sub_node}.{k}.{so}")
+        else:
+            overrides.append("{}.{}={}".format(sub_node, k, val))
+
+    return overrides
+
+
+def migrate_registry(
+    name, value, registry, args, overrides, deletes, use_name_as_val=False
+):
+    """Append overrides for group *name*: full dataclass overrides if *value* is registered, else the bare name or a delete."""
+    if value in registry:
+        overrides.extend([f"{name}={value}", f"{name}._name={value}"])
+        overrides.extend(_override_attr(name, registry[value], args))
+    elif use_name_as_val and value is not None:
+        overrides.append(f"{name}={value}")
+    else:
+        deletes.append(name)
+
+
+def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]:
+    """Translate the flat *args* into (hydra override strings, names of config groups to reset to None)."""
+    overrides = []
+    deletes = []
+
+    for k in FairseqConfig.__dataclass_fields__.keys():
+        overrides.extend(
+            _override_attr(k, FairseqConfig.__dataclass_fields__[k].type, args)  # yields nothing for non-FairseqDataclass types
+        )
+
+    if args is not None:
+        if hasattr(args, "task"):
+            from fairseq.tasks import TASK_DATACLASS_REGISTRY
+
+            migrate_registry(
+                "task", args.task, TASK_DATACLASS_REGISTRY, args, overrides, deletes
+            )
+        else:
+            deletes.append("task")
+
+        # these options will be set to "None" if they have not yet been migrated
+        # so we can populate them with the entire flat args
+        CORE_REGISTRIES = {"criterion", "optimizer", "lr_scheduler"}
+
+        from fairseq.registry import REGISTRIES
+
+        for k, v in REGISTRIES.items():
+            if hasattr(args, k):
+                migrate_registry(
+                    k,
+                    getattr(args, k),
+                    v["dataclass_registry"],
+                    args,
+                    overrides,
+                    deletes,
+                    use_name_as_val=k not in CORE_REGISTRIES,  # non-core groups keep a bare "<group>=<name>" override
+                )
+            else:
+                deletes.append(k)
+
+        no_dc = True  # stays True unless the architecture's model class exposes a dataclass
+        if hasattr(args, "arch"):
+            from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_MODEL_NAME_REGISTRY
+
+            if args.arch in ARCH_MODEL_REGISTRY:
+                m_cls = ARCH_MODEL_REGISTRY[args.arch]
+                dc = getattr(m_cls, "__dataclass", None)
+                if dc is not None:
+                    m_name = ARCH_MODEL_NAME_REGISTRY[args.arch]
+                    overrides.append("model={}".format(m_name))
+                    overrides.append("model._name={}".format(args.arch))
+                    # override model params with those exist in args
+                    overrides.extend(_override_attr("model", dc, args))
+                    no_dc = False
+        if no_dc:
+            deletes.append("model")
+
+    return overrides, deletes
+
+
+class omegaconf_no_object_check:
+    """Temporarily make omegaconf's primitive-type check accept every type.
+
+    The original check is saved on construction and put back when the ``with`` block exits.
+    """
+
+    def __init__(self):
+        # Renamed in https://github.com/omry/omegaconf/pull/911 - both names are kept for back compat.
+        has_old_name = hasattr(_utils, "is_primitive_type")
+        self._hook = "is_primitive_type" if has_old_name else "is_primitive_type_annotation"
+        self.old_is_primitive = getattr(_utils, self._hook)
+
+    def __enter__(self):
+        # install a permissive stand-in for the duration of the block
+        setattr(_utils, self._hook, lambda _: True)
+
+    def __exit__(self, type, value, traceback):
+        # restore the saved check; returns None, so exceptions from the block propagate
+        setattr(_utils, self._hook, self.old_is_primitive)
+
+
+def convert_namespace_to_omegaconf(args: Namespace) -> DictConfig:
+    """Convert a flat argparse.Namespace to a structured DictConfig (returned in struct mode)."""
+
+    # Here we are using field values provided in args to override counterparts inside config object
+    overrides, deletes = override_module_args(args)
+
+    # configs will be in fairseq/config after installation
+    config_path = os.path.join("..", "config")
+
+    GlobalHydra.instance().clear()  # reset global hydra state before initialize() below
+
+    with initialize(config_path=config_path):
+        try:
+            composed_cfg = compose("config", overrides=overrides, strict=False)
+        except:
+            logger.error("Error when composing. Overrides: " + str(overrides))  # log the overrides, then re-raise unchanged
+            raise
+
+        for k in deletes:
+            composed_cfg[k] = None  # groups without a dataclass are blanked; some are refilled from args below
+
+    cfg = OmegaConf.create(
+        OmegaConf.to_container(composed_cfg, resolve=True, enum_to_str=True)
+    )
+
+    # hack to be able to set Namespace in dict config. this should be removed when we update to newer
+    # omegaconf version that supports object flags, or when we migrate all existing models
+    from omegaconf import _utils  # NOTE(review): not referenced in this function and already imported at module level
+
+    with omegaconf_no_object_check():
+        if cfg.task is None and getattr(args, "task", None):
+            cfg.task = Namespace(**vars(args))  # each legacy group gets its own copy of the whole flat args
+            from fairseq.tasks import TASK_REGISTRY
+
+            _set_legacy_defaults(cfg.task, TASK_REGISTRY[args.task])
+            cfg.task._name = args.task
+        if cfg.model is None and getattr(args, "arch", None):
+            cfg.model = Namespace(**vars(args))
+            from fairseq.models import ARCH_MODEL_REGISTRY
+
+            _set_legacy_defaults(cfg.model, ARCH_MODEL_REGISTRY[args.arch])
+            cfg.model._name = args.arch
+        if cfg.optimizer is None and getattr(args, "optimizer", None):
+            cfg.optimizer = Namespace(**vars(args))
+            from fairseq.optim import OPTIMIZER_REGISTRY
+
+            _set_legacy_defaults(cfg.optimizer, OPTIMIZER_REGISTRY[args.optimizer])
+            cfg.optimizer._name = args.optimizer
+        if cfg.lr_scheduler is None and getattr(args, "lr_scheduler", None):
+            cfg.lr_scheduler = Namespace(**vars(args))
+            from fairseq.optim.lr_scheduler import LR_SCHEDULER_REGISTRY
+
+            _set_legacy_defaults(
+                cfg.lr_scheduler, LR_SCHEDULER_REGISTRY[args.lr_scheduler]
+            )
+            cfg.lr_scheduler._name = args.lr_scheduler
+        if cfg.criterion is None and getattr(args, "criterion", None):
+            cfg.criterion = Namespace(**vars(args))
+            from fairseq.criterions import CRITERION_REGISTRY
+
+            _set_legacy_defaults(cfg.criterion, CRITERION_REGISTRY[args.criterion])
+            cfg.criterion._name = args.criterion
+
+    OmegaConf.set_struct(cfg, True)
+    return cfg
+
+
+def overwrite_args_by_name(cfg: DictConfig, overrides: Dict[str, Any]):
+    """Apply *overrides* to *cfg* in place: DictConfig children are searched recursively, Namespace children receive every override as an attribute."""
+    # this will be deprecated when we get rid of argparse and model_overrides logic
+    from fairseq.registry import REGISTRIES
+
+    with open_dict(cfg):
+        for k in cfg.keys():
+            # "k in cfg" will return false if its a "mandatory value (e.g. ???)"
+            if k in cfg and isinstance(cfg[k], DictConfig):
+                if k in overrides and isinstance(overrides[k], dict):
+                    for ok, ov in overrides[k].items():
+                        if isinstance(ov, dict) and cfg[k][ok] is not None:
+                            overwrite_args_by_name(cfg[k][ok], ov)
+                        else:
+                            cfg[k][ok] = ov
+                else:
+                    overwrite_args_by_name(cfg[k], overrides)
+            elif k in cfg and isinstance(cfg[k], Namespace):
+                for override_key, val in overrides.items():
+                    setattr(cfg[k], override_key, val)
+            elif k in overrides:
+                if (
+                    k in REGISTRIES
+                    and overrides[k] in REGISTRIES[k]["dataclass_registry"]
+                ):
+                    cfg[k] = DictConfig(
+                        REGISTRIES[k]["dataclass_registry"][overrides[k]]
+                    )
+                    overwrite_args_by_name(cfg[k], overrides)
+                    cfg[k]._name = overrides[k]
+                else:
+                    cfg[k] = overrides[k]
+
+
+def merge_with_parent(dc: FairseqDataclass, cfg: DictConfig, remove_missing=False):
+    """Return ``OmegaConf.merge(dc, cfg)`` in struct mode with cfg's parent link; optionally drop cfg keys *dc* lacks first."""
+    if remove_missing:
+
+        def prune(source, target):
+            # delete keys of *source* that *target* does not declare, descending into nested configs
+            declared = target.__dataclass_fields__ if is_dataclass(target) else target
+            allowed = set(declared.keys())
+
+            for key in list(source.keys()):
+                if key not in allowed:
+                    del source[key]
+                elif OmegaConf.is_config(source[key]):
+                    nested = getattr(target, key)
+                    if nested is not None and (is_dataclass(nested) or hasattr(nested, "keys")):
+                        prune(source[key], nested)
+
+        with open_dict(cfg):
+            prune(cfg, dc)
+
+    merged = OmegaConf.merge(dc, cfg)
+    merged.__dict__["_parent"] = cfg.__dict__["_parent"]
+    OmegaConf.set_struct(merged, True)
+    return merged
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
new file mode 100644
index 0000000000..9130db8f5d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .distributed_timeout_wrapper import DistributedTimeoutWrapper
+from .fully_sharded_data_parallel import (
+    fsdp_enable_wrap,
+    fsdp_wrap,
+    FullyShardedDataParallel,
+)
+from .legacy_distributed_data_parallel import LegacyDistributedDataParallel
+from .module_proxy_wrapper import ModuleProxyWrapper
+from .tpu_distributed_data_parallel import TPUDistributedDataParallel
+
+
+__all__ = [
+    "DistributedTimeoutWrapper",
+    "fsdp_enable_wrap",
+    "fsdp_wrap",
+    "FullyShardedDataParallel",
+    "LegacyDistributedDataParallel",
+    "ModuleProxyWrapper",
+    "TPUDistributedDataParallel",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
new file mode 100644
index 0000000000..6e06b4b6dd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
@@ -0,0 +1,97 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import signal
+import threading
+
+from torch import nn
+
+
+logger = logging.getLogger(__name__)
+
+
+class DistributedTimeoutWrapper(nn.Module):
+    """
+    A wrapper that kills the process if no progress is made within a given
+    *timeout*. The timer is reset every time :func:`forward` is called.
+
+    Usage::
+
+        module = DistributedTimeoutWrapper(module, timeout=30)
+        x = module(input)
+        time.sleep(20)  # safe
+        x = module(input)
+        time.sleep(45)  # job will be killed before this returns
+
+    Args:
+        module (nn.Module): module to wrap
+        timeout (int): number of seconds before killing the process
+            (set to a value <= 0 to disable the timeout)
+        signal (Optional): signal to send once timeout is triggered
+    """
+
+    def __init__(self, module: nn.Module, timeout: int, signal=signal.SIGINT):
+        super().__init__()
+        self.module = module
+        self.timeout = timeout
+        self.signal = signal
+        # Initialised before the watchdog thread starts, because that thread reads it.
+        self._terminated = False
+        if timeout > 0:
+            self._heartbeat = threading.Event()
+            self._heartbeat_thread = threading.Thread(
+                target=self._check_heartbeat,
+                args=(os.getpid(),),
+                daemon=True,
+            )
+            self._heartbeat_thread.start()
+        else:
+            self._heartbeat = None
+            self._heartbeat_thread = None
+
+    def __del__(self):
+        self.stop_timeout()
+
+    def __getattr__(self, name):
+        """Forward missing attributes to wrapped module."""
+        try:
+            return super().__getattr__(name)  # defer to nn.Module's logic
+        except AttributeError:
+            return getattr(self.module, name)
+
+    def stop_timeout(self):
+        if self._heartbeat_thread is not None:
+            self._terminated = True
+            self._heartbeat.set()  # wake the watchdog so join() returns promptly
+            self._heartbeat_thread.join()
+
+    def state_dict(self, *args, **kwargs):
+        return self.module.state_dict(*args, **kwargs)
+
+    def load_state_dict(self, *args, **kwargs):
+        return self.module.load_state_dict(*args, **kwargs)
+
+    def forward(self, *args, **kwargs):
+        if self._heartbeat is not None:
+            self._heartbeat.set()
+        return self.module(*args, **kwargs)
+
+    def _check_heartbeat(self, parent_pid):
+        self._heartbeat.wait()  # wait for the first forward pass (or stop_timeout)
+        while True:
+            self._heartbeat.clear()
+            if self._terminated:  # tested after clear() so a stop request is never lost
+                break
+            if not self._heartbeat.wait(timeout=self.timeout) and not self._terminated:
+                logger.error(
+                    (
+                        "Killing job for not making progress in {} seconds. "
+                        "Set --heartbeat-timeout=-1 to disable this timeout."
+                    ).format(int(self.timeout))
+                )
+                os.kill(parent_pid, self.signal)
+                return
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
new file mode 100644
index 0000000000..1c508b05dd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+from typing import Optional
+
+import torch
+from fairseq.dataclass.configs import DistributedTrainingConfig
+from fairseq.distributed import utils as dist_utils
+
+
+try:
+    from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
+
+    has_FSDP = True
+except ImportError:
+    FSDP = torch.nn.Module
+    has_FSDP = False
+
+
+class FullyShardedDataParallel(FSDP):
+    """
+    A small wrapper around fairscale's FullyShardedDataParallel (FSDP) with some
+    fairseq-specific checkpoint saving/loading logic.
+
+    Args:
+        use_sharded_state (bool): if True, then ``state_dict`` will return
+            ``FSDP.local_state_dict`` and ``load_state_dict`` will call
+            ``FSDP.load_local_state_dict``. Otherwise, ``state_dict`` will
+            return the full model weights on data parallel rank 0 (empty on
+            other ranks) and ``load_state_dict`` will broadcast model weights
+            from rank 0 to other ranks.
+    """
+
+    def __init__(self, *args, use_sharded_state: bool = False, **kwargs):
+        # Without fairscale the FSDP base is only a placeholder, so fail early.
+        if not has_FSDP:
+            raise ImportError(
+                "Cannot find FullyShardedDataParallel. "
+                "Please install fairscale with: pip install fairscale"
+            )
+        super().__init__(*args, **kwargs)
+        self.use_sharded_state = use_sharded_state
+
+    @property
+    def unwrapped_module(self) -> torch.nn.Module:
+        """Return ``self.module.module`` if parameters are flattened, else ``self.module``."""
+        return self.module.module if self.flatten_parameters else self.module
+
+    def state_dict(self, destination=None, prefix="", keep_vars=False):
+        """Return the local shard, or the full state on rank 0 (``destination or {}`` elsewhere)."""
+        if self.use_sharded_state:
+            return super().local_state_dict(
+                destination=destination, prefix=prefix, keep_vars=keep_vars
+            )
+        if self.rank == 0:
+            return super().state_dict(
+                destination=destination, prefix=prefix, keep_vars=keep_vars
+            )
+        # We must call state_dict() due to use of communication
+        # primitives. But we don't use the result.
+        super().state_dict()
+        return destination or {}
+
+    def load_state_dict(self, state_dict, strict=True, model_cfg=None):
+        """Load a local shard, or the state broadcast from rank 0 when not sharded."""
+        if self.use_sharded_state:
+            return super().load_local_state_dict(state_dict, strict=strict)
+        # ``model_cfg`` is accepted but not used by this method.
+        # The state handed to the parent loader is whatever rank 0 broadcasts.
+        shared_state = dist_utils.broadcast_object(
+            state_dict, src_rank=0, group=self.process_group
+        )
+        return super().load_state_dict(shared_state, strict=strict)
+
+
+class DummyProcessGroup:
+    """Stand-in process group that reports a fixed rank and size."""
+    def __init__(self, rank: int, size: int):
+        self._rank, self._size = rank, size
+
+    def rank(self) -> int:
+        return self._rank
+
+    def size(self) -> int:
+        return self._size
+
+
+@contextlib.contextmanager
+def fsdp_enable_wrap(cfg: DistributedTrainingConfig):
+    """Context manager that enters fairscale's ``enable_wrap`` with FSDP options taken from ``cfg``."""
+    try:
+        from fairscale.nn import enable_wrap
+    except ImportError:
+        raise ImportError(
+            "Cannot find FullyShardedDataParallel. "
+            "Please install fairscale with: pip install fairscale"
+        )
+    if cfg.memory_efficient_fp16:
+        assert cfg.fp16  # memory_efficient_fp16 should imply fp16
+    pg = dist_utils.get_data_parallel_group()
+    if pg is None and cfg.distributed_world_size == 1:
+        # no group exists and the world size is 1: use a fixed rank-0, size-1 group
+        pg = DummyProcessGroup(rank=0, size=1)
+    # All options are handed to enable_wrap as keyword arguments.
+    with enable_wrap(
+        wrapper_cls=FullyShardedDataParallel,
+        use_sharded_state=cfg.use_sharded_state,
+        process_group=pg,
+        reshard_after_forward=not cfg.no_reshard_after_forward,
+        mixed_precision=cfg.fp16 and not cfg.memory_efficient_fp16,
+        fp32_reduce_scatter=cfg.fp32_reduce_scatter,
+        flatten_parameters=not cfg.not_fsdp_flatten_parameters,
+        cpu_offload=cfg.cpu_offload,
+        compute_dtype=torch.float16 if cfg.fp16 else torch.float32,
+        bucket_cap_mb=cfg.bucket_cap_mb,
+        state_dict_device=torch.device("cpu"),  # reduce GPU mem usage
+    ):
+        yield
+
+
+def fsdp_wrap(module, min_num_params: Optional[int] = None, **kwargs):
+    """
+    Wrap *module* with fairscale's ``wrap`` helper, returning it unchanged
+    when fairscale cannot be imported.
+
+    Args:
+        module (nn.Module): module to (maybe) wrap
+        min_num_params (int, Optional): modules with fewer parameters than
+            this are returned unwrapped; ``None`` disables the size check
+    """
+    try:
+        from fairscale.nn import wrap
+
+        if min_num_params is None:
+            return wrap(module, **kwargs)
+        # parameters are only counted when a threshold was requested
+        total = sum(p.numel() for p in module.parameters())
+        if total < min_num_params:
+            return module
+        return wrap(module, **kwargs)
+    except ImportError:
+        return module
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
new file mode 100644
index 0000000000..cd434c7372
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
@@ -0,0 +1,165 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+A modified version of the legacy DistributedDataParallel module that uses c10d
+communication primitives. This version is simpler than the latest PyTorch
+version and is useful for debugging. Notably it does not overlap gradient
+communication with the backward pass, which makes it slower but more robust
+than the PyTorch version.
+
+This version also supports the *no_sync* context manager, which allows faster
+training with `--update-freq`.
+"""
+
+from collections import OrderedDict
+from contextlib import contextmanager
+
+import torch
+from torch import nn
+
+from fairseq.distributed import utils
+
+
+class LegacyDistributedDataParallel(nn.Module):
+    """Implements distributed data parallelism at the module level.
+
+    A simplified version of :class:`torch.nn.parallel.DistributedDataParallel`.
+    This version uses a c10d process group for communication and does not
+    broadcast buffers.
+
+    Args:
+        module (~torch.nn.Module): module to be parallelized
+        process_group: the c10d process group to be used for distributed data
+            parallel all-reduction.
+        buffer_size (int, optional): number of elements to buffer before
+            performing all-reduce (default: 256M).
+    """
+
+    def __init__(self, module, process_group, buffer_size=2**28):
+        super().__init__()
+
+        self.module = module
+        self.process_group = process_group
+        self.world_size = utils.get_world_size(self.process_group)
+
+        # Never use a bigger buffer than the number of model params
+        self.buffer_size = min(buffer_size, sum(p.numel() for p in module.parameters()))
+        self.buffer = None
+
+        # We can also forcibly accumulate grads locally and only do the
+        # all-reduce at some later time
+        self.accumulate_grads = False
+
+        # make per-device lists of parameters
+        paramlists = OrderedDict()
+        for param in self.module.parameters():
+            paramlists.setdefault(param.device, []).append(param)
+        self.per_device_params = list(paramlists.values())
+
+    @contextmanager
+    def no_sync(self):
+        """A context manager to disable gradient synchronization."""
+        old_accumulate_grads = self.accumulate_grads
+        self.accumulate_grads = True
+        try:
+            yield
+        finally:
+            # restore the previous setting even if the body raised
+            self.accumulate_grads = old_accumulate_grads
+
+    def forward(self, *inputs, **kwargs):
+        return self.module(*inputs, **kwargs)
+
+    def all_reduce_grads(self):
+        """
+        This function must be called explicitly after backward to reduce
+        gradients. There is no automatic hook like c10d.
+        """
+
+        def all_reduce_params(params):
+            buffer = self.buffer
+            nonzero_buffer = False
+            if len(params) > 1:
+                offset = 0
+                for p in params:
+                    sz = p.numel()
+                    if p.grad is not None:
+                        buffer[offset : offset + sz].copy_(p.grad.data.view(-1))
+                        nonzero_buffer = True
+                    else:
+                        buffer[offset : offset + sz].zero_()
+                    offset += sz
+            else:
+                # we only have a single grad to all-reduce
+                p = params[0]
+                if p.grad is not None:
+                    buffer = p.grad.data
+                    nonzero_buffer = True
+                elif p.numel() <= self.buffer.numel():
+                    buffer = buffer[: p.numel()]
+                    buffer.zero_()
+                else:
+                    buffer = torch.zeros_like(p)
+
+            if nonzero_buffer:
+                buffer.div_(self.world_size)
+
+            utils.all_reduce(buffer, self.process_group)
+
+            # copy all-reduced grads back into their original place
+            offset = 0
+            for p in params:
+                sz = p.numel()
+                if p.grad is not None:
+                    p.grad.data.copy_(buffer[offset : offset + sz].view_as(p))
+                else:
+                    p.grad = buffer[offset : offset + sz].view_as(p).clone()
+                offset += sz
+
+        def reduction_fn():
+            # This function only needs to be called once
+            if self.accumulate_grads:
+                return
+
+            if self.buffer is None:
+                self.buffer = next(self.module.parameters()).new(self.buffer_size)
+
+            for params in self.per_device_params:
+                # All-reduce the gradients in buckets
+                offset = 0
+                buffered_params = []
+                for param in params:
+                    if not param.requires_grad:
+                        continue
+                    if param.grad is None:
+                        param.grad = torch.zeros_like(param)
+
+                    if hasattr(param, "expert"):
+                        # Skip gradient sync for unshared parameters
+                        continue
+
+                    if param.grad.requires_grad:
+                        raise RuntimeError(
+                            "DistributedDataParallel only works "
+                            "with gradients that don't require "
+                            "grad"
+                        )
+                    sz = param.numel()
+                    if sz > self.buffer.numel():
+                        # all-reduce big params directly
+                        all_reduce_params([param])
+                    else:
+                        if offset + sz > self.buffer.numel():
+                            all_reduce_params(buffered_params)
+                            offset = 0
+                            buffered_params.clear()
+                        buffered_params.append(param)
+                        offset += sz
+
+                if len(buffered_params) > 0:
+                    all_reduce_params(buffered_params)
+
+        reduction_fn()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
new file mode 100644
index 0000000000..904dc0c202
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch import nn
+
+
+class ModuleProxyWrapper(nn.Module):
+    """
+    Proxy for a module that is itself a wrapper (e.g. DistributedDataParallel).
+    Missing attributes fall through to the wrapper, then to the module it
+    wraps; :func:`state_dict` and :func:`load_state_dict` use the latter.
+
+    Usage::
+
+        module.xyz = "hello world"
+        wrapped_module = DistributedDataParallel(module, **ddp_args)
+        wrapped_module = ModuleProxyWrapper(wrapped_module)
+        assert wrapped_module.xyz == "hello world"
+        assert wrapped_module.state_dict().keys() == module.state_dict().keys()
+
+    Args:
+        module (nn.Module): wrapper module exposing a ``module`` attribute
+    """
+
+    def __init__(self, module: nn.Module):
+        super().__init__()
+        has_inner = hasattr(module, "module")
+        assert has_inner, "ModuleProxyWrapper expects input to wrap another module"
+        self.module = module
+
+    def __getattr__(self, name):
+        """Resolve *name* on self, then on the wrapper, then on the module it wraps."""
+        try:
+            # nn.Module's own lookup comes first
+            return super().__getattr__(name)
+        except AttributeError:
+            try:
+                # next, the once-wrapped module
+                return getattr(self.module, name)
+            except AttributeError:
+                # finally, the twice-wrapped module
+                return getattr(self.module.module, name)
+
+    def state_dict(self, *args, **kwargs):
+        """Return the state dict of the twice-wrapped module."""
+        return self.module.module.state_dict(*args, **kwargs)
+
+    def load_state_dict(self, *args, **kwargs):
+        """Load a state dict into the twice-wrapped module."""
+        return self.module.module.load_state_dict(*args, **kwargs)
+
+    def forward(self, *args, **kwargs):
+        """Call the once-wrapped module with the given arguments."""
+        return self.module(*args, **kwargs)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
new file mode 100644
index 0000000000..3b9e103301
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+
+from fairseq.distributed import utils
+
+
+class TPUDistributedDataParallel(nn.Module):
+    """Data-parallel wrapper whose gradients are all-reduced through torch_xla."""
+    def __init__(self, module, process_group):
+        super().__init__()
+        self.module = module
+        self.process_group = process_group
+        self.world_size = utils.get_world_size(self.process_group)
+
+    def forward(self, *inputs, **kwargs):
+        return self.module(*inputs, **kwargs)
+
+    def all_reduce_grads(self):
+        """Sum gradients over ``process_group[1]`` with XLA, scaled by ``1 / world_size``."""
+        grads = []
+        for param in (p for p in self.parameters() if p.requires_grad):
+            if param.grad is None:
+                param.grad = torch.zeros_like(param)
+            if param.grad.requires_grad:
+                raise RuntimeError(
+                    "TPUDistributedDataParallel only works with gradients that don't "
+                    "require grad"
+                )
+            grads.append(param.grad)
+
+        import torch_xla.core.xla_model as xm
+
+        xm.all_reduce(
+            "sum",
+            grads,
+            scale=1.0 / self.world_size,
+            groups=self.process_group[1],
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
new file mode 100644
index 0000000000..968830d585
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
@@ -0,0 +1,843 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import io
+import logging
+import os
+import pickle
+import random
+import socket
+import struct
+import subprocess
+import warnings
+from argparse import Namespace
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Mapping, Optional
+
+import torch
+import torch.distributed as dist
+from fairseq.dataclass.configs import DistributedTrainingConfig, FairseqConfig
+from omegaconf import open_dict
+
+try:
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    xm = None
+
+
+# Flag to indicate if we're using Megatron
+# NOTE: this is a temporary hack until we move away from Megatron's model parallel init
+_USE_MEGATRON = False
+
+# Whether to use XLA ops (e.g., on TPUs) instead of CUDA ops.
+_USE_XLA = False
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_master(cfg: DistributedTrainingConfig):
+    return cfg.distributed_rank == 0
+
+
+def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False):
+    """Fill in the distributed init settings on ``cfg`` from the environment; no-op if an init method is already set or ``cfg.tpu`` is true."""
+    if cfg.distributed_init_method is not None or cfg.tpu:
+        return
+    num_pipelines_per_node = None
+    if cfg.pipeline_model_parallel:
+        num_pipeline_devices, num_pipelines_per_node = _pipeline_parallel_pre_init(cfg)
+    # A world size of 1 needs no init method.
+    if cfg.distributed_world_size == 1:
+        return
+    if all(
+        key in os.environ
+        for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"]
+    ):
+        # support torch.distributed.launch
+        _infer_torch_distributed_launch_init(cfg)
+    else:
+        # we can determine the init method automatically for Slurm
+        if not _infer_slurm_init(cfg, num_pipelines_per_node):
+            if cfg.distributed_port <= 0 or force_distributed:
+                _infer_single_node_init(cfg)
+        elif cfg.distributed_port <= 0:
+            _infer_single_node_init(cfg)
+
+    if cfg.pipeline_model_parallel:
+        _pipeline_parallel_post_init(cfg, num_pipeline_devices, num_pipelines_per_node)
+    elif not cfg.distributed_no_spawn:
+        with open_dict(cfg):
+            cfg.distributed_num_procs = min(
+                torch.cuda.device_count(), cfg.distributed_world_size
+            )
+    else:
+        if cfg.device_id > 0:
+            logger.info(
+                "setting CUDA device={} on rank {}".format(
+                    cfg.device_id, cfg.distributed_rank
+                )
+            )
+            torch.cuda.set_device(cfg.device_id)
+
+
+def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig):
+    cfg.distributed_init_method = "env://"
+    cfg.distributed_world_size = int(os.environ["WORLD_SIZE"])
+    cfg.distributed_rank = int(os.environ["RANK"])
+    cfg.device_id = cfg.distributed_rank % torch.cuda.device_count()
+    # processes are created by torch.distributed.launch
+    cfg.distributed_no_spawn = True
+
+
+def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node):
+    """Configure ``cfg`` from Slurm environment variables; return True if a Slurm node list was found and used, else False."""
+    node_list = os.environ.get("SLURM_STEP_NODELIST")
+    if node_list is None:
+        node_list = os.environ.get("SLURM_JOB_NODELIST")
+    if node_list is not None:
+        try:
+            hostnames = subprocess.check_output(
+                ["scontrol", "show", "hostnames", node_list]
+            )
+            cfg.distributed_init_method = "tcp://{host}:{port}".format(
+                host=hostnames.split()[0].decode("utf-8"),
+                port=cfg.distributed_port,
+            )
+            nnodes = int(os.environ.get("SLURM_NNODES"))
+            ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
+            if ntasks_per_node is not None:
+                ntasks_per_node = int(ntasks_per_node)
+            else:
+                ntasks = int(os.environ.get("SLURM_NTASKS"))
+                nnodes = int(os.environ.get("SLURM_NNODES"))
+                assert ntasks % nnodes == 0
+                ntasks_per_node = int(ntasks / nnodes)
+            if ntasks_per_node == 1:
+                gpus_per_node = torch.cuda.device_count()
+                node_id = int(os.environ.get("SLURM_NODEID"))
+                cfg.distributed_rank = node_id * gpus_per_node
+                cfg.distributed_world_size = nnodes * gpus_per_node
+            elif cfg.pipeline_model_parallel:
+                assert ntasks_per_node == num_pipelines_per_node, (
+                    "SLURM --ntasks-per-node must match number of pipelines per "
+                    "node (={})".format(num_pipelines_per_node)
+                )
+                cfg.distributed_no_spawn = True
+                # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
+                # the first node, [2, 3] on the second node, etc. This
+                # matches torch.distributed.launch.
+                node_id = int(os.environ.get("SLURM_NODEID"))
+                local_id = int(os.environ.get("SLURM_LOCALID"))
+                cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
+                # In the above example, device_id will always be in [0, 1],
+                # which also matches torch.distributed.launch.
+                cfg.device_id = local_id
+                # We also want to set distributed_world_size to be the total
+                # number of pipelines across all nodes.
+                cfg.distributed_world_size = nnodes * num_pipelines_per_node
+            else:
+                assert (
+                    ntasks_per_node == cfg.distributed_world_size // nnodes
+                ), f"{ntasks_per_node}, {cfg.distributed_world_size}, {nnodes}"
+                cfg.distributed_no_spawn = True
+                cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
+                cfg.device_id = int(os.environ.get("SLURM_LOCALID"))
+            logger.info(f"Rank {cfg.distributed_rank}, device_id: {cfg.device_id}")
+            return True
+        except subprocess.CalledProcessError as e:  # scontrol failed
+            raise e
+        except FileNotFoundError:  # Slurm is not installed
+            pass
+    return False
+
+
+def _infer_single_node_init(cfg: DistributedTrainingConfig):
+    """Set ``cfg.distributed_init_method`` to a ``tcp://localhost`` address, choosing a port when ``cfg.distributed_port`` is <= 0."""
+    assert (
+        cfg.distributed_world_size <= torch.cuda.device_count()
+    ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices"
+    if cfg.distributed_port <= 0:
+        jobid = os.environ.get("SLURM_JOB_ID")
+        task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
+        # With a Slurm job id the port is drawn from an RNG seeded by that id (plus array task id).
+        if jobid is not None:
+            if task_id is not None:
+                jobid += str(task_id)
+            jobid = int(jobid)
+            rng = random.Random(jobid)
+            port = rng.randint(10000, 60000)
+        else:
+            port = random.randint(10000, 60000)
+
+        cfg.distributed_port = port
+    cfg.distributed_init_method = "tcp://localhost:{port}".format(
+        port=cfg.distributed_port
+    )
+
+
+def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig):
+    """Check the pipeline options on ``cfg``, convert them with ``eval_str_list`` and return ``(num_pipeline_devices, num_pipelines_per_node)``."""
+    from fairseq import utils
+    balance_exists = (
+        cfg.pipeline_balance is not None
+        or cfg.pipeline_encoder_balance is not None
+        or cfg.pipeline_decoder_balance is not None
+    )
+    devices_exist = (
+        cfg.pipeline_devices is not None
+        or cfg.pipeline_encoder_devices is not None
+        or cfg.pipeline_decoder_devices is not None
+    )
+    if not balance_exists:
+        raise ValueError(
+            "--pipeline-balance is currently required for pipeline model parallelism"
+        )
+    if not devices_exist:
+        raise ValueError(
+            "--pipeline-devices is currently required for pipeline model parallelism"
+        )
+
+    cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int)
+    if cfg.pipeline_devices is not None:
+        cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int)
+        num_pipeline_devices = len(set(cfg.pipeline_devices))
+    else:
+        cfg.pipeline_encoder_devices = utils.eval_str_list(
+            cfg.pipeline_encoder_devices, type=int
+        )
+        cfg.pipeline_decoder_devices = utils.eval_str_list(
+            cfg.pipeline_decoder_devices, type=int
+        )
+        num_pipeline_devices = len(
+            set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices)
+        )
+    gpus_per_node = torch.cuda.device_count()
+    assert (
+        gpus_per_node >= num_pipeline_devices
+        and gpus_per_node % num_pipeline_devices == 0
+    ), (
+        "the number of unique device IDs in --pipeline-devices must evenly divide "
+        "the number of GPUs per node (multi-node pipelining is not yet supported)"
+    )
+    num_pipelines_per_node = gpus_per_node // num_pipeline_devices
+    return num_pipeline_devices, num_pipelines_per_node
+
+
+def _pipeline_parallel_post_init(
+    cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node
+):
+    """Rescale rank, world size and device ids on ``cfg`` from per-GPU to per-pipeline units."""
+    if not cfg.distributed_no_spawn:
+        # When distributed_no_spawn is False, we expect distributed_rank and
+        # distributed_world_size to be based on the total number of GPUs, so
+        # we need to correct them to be based on the number of pipelines.
+        assert cfg.distributed_world_size % num_pipeline_devices == 0
+        cfg.distributed_world_size = cfg.distributed_world_size // num_pipeline_devices
+        # In the case of 4-way MP on nodes with 8 GPUs, we want
+        # distributed_rank to be the starting GPU index for each pipeline
+        # i.e., 0, 2, ...
+        gpus_per_node = torch.cuda.device_count()
+        assert cfg.distributed_rank % gpus_per_node == 0
+        assert cfg.distributed_rank % num_pipeline_devices == 0
+        with open_dict(cfg):
+            cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
+            # launch one process per pipeline
+            cfg.distributed_num_procs = num_pipelines_per_node
+
+    # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0
+    # and 4, indicating the starting device IDs for each pipeline
+    cfg.device_id *= num_pipeline_devices
+
+    if cfg.device_id > 0:
+        # if there's multiple pipelines on a node (e.g., 4-way MP on an 8
+        # GPU node), we need to adjust pipeline_devices accordingly
+        logger.debug(
+            "setting CUDA device={} on rank {}".format(
+                cfg.device_id, cfg.distributed_rank
+            )
+        )
+        torch.cuda.set_device(cfg.device_id)
+        with open_dict(cfg):
+            cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices]
+        logger.info(
+            "setting pipeline_devices={} on rank {}".format(
+                cfg.pipeline_devices, cfg.distributed_rank
+            )
+        )
+
+
+def distributed_init(cfg: FairseqConfig):
+    if isinstance(cfg, Namespace):
+        from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+
+        cfg = convert_namespace_to_omegaconf(cfg)
+
+    if not cfg.common.tpu:
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            warnings.warn(
+                "Distributed is already initialized, cannot initialize twice!"
+            )
+        else:
+            logger.info(
+                "distributed init (rank {}): {}".format(
+                    cfg.distributed_training.distributed_rank,
+                    cfg.distributed_training.distributed_init_method,
+                )
+            )
+            dist.init_process_group(
+                backend=cfg.distributed_training.distributed_backend,
+                init_method=cfg.distributed_training.distributed_init_method,
+                world_size=cfg.distributed_training.distributed_world_size,
+                rank=cfg.distributed_training.distributed_rank,
+            )
+            logger.info(
+                "initialized host {} as rank {}".format(
+                    socket.gethostname(),
+                    cfg.distributed_training.distributed_rank,
+                )
+            )
+
+            # perform a dummy all-reduce to initialize the NCCL communicator
+            if torch.cuda.is_available():
+                dist.all_reduce(torch.zeros(1).cuda())
+
+        cfg.distributed_training.distributed_rank = torch.distributed.get_rank()
+    else:
+        assert xm.xrt_world_size() == cfg.distributed_training.distributed_world_size
+        global _USE_XLA
+        _USE_XLA = True  # read back by use_xla()
+        cfg.distributed_training.device_id = xm.get_local_ordinal()
+        cfg.distributed_training.distributed_rank = xm.get_ordinal()
+        xm.rendezvous("distributed_init")  # wait for all workers
+
+    if is_master(cfg.distributed_training):
+        logging.getLogger().setLevel(logging.INFO)  # root logger, master only
+    else:
+        logging.getLogger().setLevel(logging.WARNING)  # quieter on other ranks
+
+    if cfg.common.model_parallel_size > 1:
+        try:
+            from fairseq.model_parallel.megatron.mpu import (
+                initialize_model_parallel,
+                model_parallel_cuda_manual_seed,
+            )
+        except ImportError:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+        global _USE_MEGATRON
+        _USE_MEGATRON = True  # read by get_data/model_parallel_group()
+        initialize_model_parallel(cfg.common.model_parallel_size)
+        model_parallel_cuda_manual_seed(cfg.common.seed)
+        model_part_number = get_model_parallel_rank()
+        cfg.checkpoint.checkpoint_suffix += "-model_part-{0}".format(model_part_number)
+
+    if hasattr(cfg, "model") and getattr(cfg.model, "base_layers", 0) > 0:
+        cfg.checkpoint.checkpoint_suffix = (  # NOTE(review): overwrites, not appends
+            f"-rank-{cfg.distributed_training.distributed_rank}"
+        )
+
+    return cfg.distributed_training.distributed_rank
+
+
+def distributed_main(i, main, cfg: FairseqConfig, kwargs):
+    cfg.distributed_training.device_id = i
+    if torch.cuda.is_available() and not cfg.common.cpu and not cfg.common.tpu:
+        torch.cuda.set_device(cfg.distributed_training.device_id)
+    if cfg.distributed_training.distributed_rank is None:  # torch.multiprocessing.spawn
+        cfg.distributed_training.distributed_rank = kwargs.pop("start_rank", 0) + i
+
+    cfg.distributed_training.distributed_rank = distributed_init(cfg)
+
+    after_distributed_init_fn = kwargs.pop("after_distributed_init_fn", None)
+    if after_distributed_init_fn:
+        cfg = after_distributed_init_fn(cfg)
+
+    main(cfg, **kwargs)
+
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier(get_global_group())
+
+
+def call_main(cfg: FairseqConfig, main, **kwargs):
+    if cfg.distributed_training.distributed_init_method is None:
+        infer_init_method(cfg.distributed_training)
+
+    if cfg.distributed_training.distributed_init_method is not None:
+        # distributed training
+        if not cfg.distributed_training.distributed_no_spawn:
+            start_rank = cfg.distributed_training.distributed_rank
+            cfg.distributed_training.distributed_rank = None  # assign automatically
+            kwargs["start_rank"] = start_rank
+
+            torch.multiprocessing.spawn(
+                fn=distributed_main,
+                args=(main, cfg, kwargs),
+                nprocs=min(
+                    torch.cuda.device_count(),
+                    cfg.distributed_training.distributed_world_size,
+                ),
+                join=True,
+            )
+        else:
+            distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
+    elif cfg.common.tpu and cfg.distributed_training.distributed_world_size > 1:
+        import torch_xla.distributed.xla_multiprocessing as xmp
+
+        torch.multiprocessing.set_sharing_strategy("file_system")
+        xmp.spawn(
+            fn=distributed_main,
+            args=(main, cfg, kwargs),
+            # tpu-comment:
+            #   8 devices in one TPU VM, is the max processes to be spawned.
+            #   The rest is driven by xm.distributed.xla_dist
+            nprocs=min(cfg.distributed_training.distributed_world_size, 8),
+        )
+    else:
+        # single GPU main
+        main(cfg, **kwargs)
+
+
+def use_xla():
+    """Return whether XLA mode was enabled (set by distributed_init on TPU)."""
+    return _USE_XLA
+
+
+def new_groups(grouped_ranks: List[List[int]]):
+    if use_xla():
+        return ("tpu", grouped_ranks)  # XLA "group": a tag plus all rank lists
+    else:
+        groups = [dist.new_group(g) for g in grouped_ranks]  # create every group
+        my_group_idx = _find_my_group_index(grouped_ranks)
+        return groups[my_group_idx]  # only the group containing this rank
+
+
+def _find_my_group_index(grouped_ranks):  # index of the group containing this rank
+    my_rank = get_global_rank()
+    for i, group in enumerate(grouped_ranks):
+        if my_rank in group:
+            return i
+    raise RuntimeError(f"rank {my_rank} not found in grouped_ranks={grouped_ranks}")
+
+
+def _find_my_group(grouped_ranks):
+    """Return the rank list in *grouped_ranks* that contains this rank."""
+    return grouped_ranks[_find_my_group_index(grouped_ranks)]
+
+
+def get_rank(group):
+    """Return this process's rank within *group*."""
+    if use_xla():
+        assert group[0] == "tpu"
+        members = _find_my_group(group[1])
+        return members.index(get_global_rank())
+    return dist.get_rank(group=group)
+
+
+def get_world_size(group):
+    """Return the number of ranks in *group* (1 when not distributed)."""
+    if use_xla():
+        assert group[0] == "tpu"
+        members = _find_my_group(group[1])
+        return len(members)
+    if torch.distributed.is_initialized():
+        return dist.get_world_size(group=group)
+    return 1
+
+
+def get_global_group():
+    """Return the group spanning all ranks, or None when not distributed."""
+    if use_xla():
+        return new_groups([list(range(get_global_world_size()))])
+    if not torch.distributed.is_initialized():
+        return None
+    if not hasattr(get_global_group, "_global_group"):
+        # ideally we could use torch.distributed.group.WORLD, but it seems
+        # to cause random NCCL hangs in some cases
+        get_global_group._global_group = dist.new_group()
+    return get_global_group._global_group
+
+
+def get_global_rank():
+    """Return this process's global rank (0 when not distributed)."""
+    if use_xla():
+        return xm.get_ordinal()
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    return 0
+
+
+def get_global_world_size():
+    """Return the total number of ranks (1 when not distributed)."""
+    if use_xla():
+        return xm.xrt_world_size()
+    if torch.distributed.is_initialized():
+        return torch.distributed.get_world_size()
+    return 1
+
+
+def get_data_parallel_group():
+    """Get the data parallel group the caller rank belongs to."""
+    if not _USE_MEGATRON:
+        # no Megatron model parallelism: fall back to the global group
+        return get_global_group()
+
+    from fairseq.model_parallel.megatron import mpu
+
+    return mpu.get_data_parallel_group()
+
+
+def get_data_parallel_rank():
+    """Return my rank for the data parallel group."""
+    return get_rank(get_data_parallel_group())
+
+
+def get_data_parallel_world_size():
+    """Return world size for the data parallel group."""
+    return get_world_size(get_data_parallel_group())
+
+
+def get_model_parallel_group():
+    """Return Megatron's model parallel group, or None if Megatron is unused."""
+    if not _USE_MEGATRON:
+        return None
+
+    from fairseq.model_parallel.megatron import mpu
+
+    return mpu.get_model_parallel_group()
+
+
+def get_model_parallel_rank():
+    """Return my rank for the model parallel group."""
+    return get_rank(get_model_parallel_group())
+
+
+def get_model_parallel_world_size():
+    """Return world size for the model parallel group."""
+    return get_world_size(get_model_parallel_group())
+
+
+def all_reduce(tensor, group, op="sum"):  # off XLA, op must be "sum" or "max"
+    if use_xla():
+        assert isinstance(group, tuple) and group[0] == "tpu"
+        tensor = [tensor]  # wrap in a list to make xm.all_reduce in-place
+        return xm.all_reduce(op, tensor, groups=group[1])[0]
+    else:
+        if op == "sum":
+            op = dist.ReduceOp.SUM
+        elif op == "max":
+            op = dist.ReduceOp.MAX
+        else:
+            raise NotImplementedError(f"unsupported all_reduce op: {op!r}")
+        dist.all_reduce(tensor, op=op, group=group)
+        return tensor
+
+
+def broadcast(tensor, src, group):
+    if use_xla():
+        # XLA doesn't support broadcast, hack it with all_reduce
+        if get_rank(group) != src:
+            tensor.zero_()
+        all_reduce(tensor, group)
+    else:
+        dist.broadcast(tensor, src=src, group=group)
+
+
+def all_to_all(tensor, group):
+    """Perform an all-to-all operation on a 1D Tensor."""
+    assert tensor.dim() == 1
+    split_count = get_world_size(group=group)
+    assert tensor.numel() % split_count == 0
+    if use_xla():
+        assert isinstance(group, tuple) and group[0] == "tpu"
+        return xm.all_to_all(
+            tensor,
+            split_dimension=0,
+            concat_dimension=0,
+            split_count=split_count,
+            groups=group[1],
+        )
+    else:
+        output = torch.zeros_like(tensor)
+        dist.all_to_all_single(output, tensor, group=group)
+        return output
+
+
+def all_gather(tensor, group, return_tensor=False):
+    """Perform an all-gather operation."""
+    if use_xla():
+        result = xm.all_gather(tensor, groups=group[1])
+        world_size = get_world_size(group=group)
+        result = result.view(world_size, *tensor.size())
+        if return_tensor:
+            return result
+        else:
+            return [result[i] for i in range(world_size)]
+    else:
+        world_size = get_world_size(group=group)
+        rank = get_rank(group=group)
+        tensor_list = [
+            tensor if i == rank else torch.empty_like(tensor) for i in range(world_size)
+        ]
+        dist.all_gather(tensor_list, tensor, group=group)
+        if return_tensor:
+            return torch.stack(tensor_list, dim=0)
+        else:
+            return tensor_list
+
+
+def all_gather_list(data, group=None, max_size=16384):
+    """Gathers arbitrary data from all nodes into a list.
+
+    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
+    data. Note that *data* must be picklable and any CUDA tensors will be moved
+    to CPU and returned on CPU as well.
+
+    Args:
+        data (Any): data from the local worker to be gathered on other workers
+        group: group of the collective
+        max_size (int, optional): maximum size of the data to be gathered
+            across workers
+    """
+    from fairseq import utils
+
+    if group is None:
+        group = get_global_group()
+    rank = get_rank(group=group)
+    world_size = get_world_size(group=group)
+    buffer_size = max_size * world_size
+    if (
+        not hasattr(all_gather_list, "_buffer")
+        or all_gather_list._buffer.numel() < buffer_size
+        or all_gather_list._cpu_buffer.numel() < max_size  # staging buffer too small
+    ):
+        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
+        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
+    buffer = all_gather_list._buffer
+    buffer.zero_()
+    cpu_buffer = all_gather_list._cpu_buffer
+
+    data = utils.move_to_cpu(data)
+    enc = pickle.dumps(data)
+    enc_size = len(enc)
+    header_size = 4  # size of header that contains the length of the encoded data
+    size = header_size + enc_size
+    if size > max_size:
+        raise ValueError(
+            "encoded data size ({}) exceeds max_size ({})".format(size, max_size)
+        )
+
+    header = struct.pack(">I", enc_size)
+    cpu_buffer[:size] = torch.ByteTensor(list(header + enc))
+    start = rank * max_size  # each rank owns one max_size-byte slot
+    buffer[start : start + size].copy_(cpu_buffer[:size])
+
+    all_reduce(buffer, group=group)  # slots are disjoint and zeroed: sum == gather
+
+    buffer = buffer.cpu()
+    try:
+        result = []
+        for i in range(world_size):
+            out_buffer = buffer[i * max_size : (i + 1) * max_size]
+            (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist()))
+            if enc_size > 0:
+                result.append(
+                    pickle.loads(  # peers are trusted; never feed external data
+                        bytes(out_buffer[header_size : header_size + enc_size].tolist())
+                    )
+                )
+        return result
+    except pickle.UnpicklingError:
+        raise Exception(
+            "Unable to unpickle data from other workers. all_gather_list requires all "
+            "workers to enter the function together, so this error usually indicates "
+            "that the workers have fallen out of sync somehow. Workers can fall out of "
+            "sync if one of them runs out of memory, or if there are other conditions "
+            "in your training script that can cause one worker to finish an epoch "
+            "while other workers are still iterating over their portions of the data. "
+            "Try rerunning with --ddp-backend=legacy_ddp and see if that helps."
+        )
+
+
+def all_reduce_dict(data: Mapping[str, Any], device, group) -> Dict[str, Any]:
+    """
+    AllReduce a dictionary of values across workers. We separately
+    reduce items that are already on the device and items on CPU for
+    better performance.
+
+    Args:
+        data (Mapping[str, Any]): dictionary of data to all-reduce, but
+            cannot be a nested dictionary
+        device (torch.device): device for the reduction
+        group: group of the collective
+    """
+    data_keys = list(data.keys())
+
+    # We want to separately reduce items that are already on the
+    # device and items on CPU for performance reasons.
+    cpu_data = OrderedDict()
+    device_data = OrderedDict()
+    for k in data_keys:
+        t = data[k]
+        if not torch.is_tensor(t):
+            cpu_data[k] = torch.tensor(t, dtype=torch.double)
+        elif t.device.type != device.type:
+            cpu_data[k] = t.to(dtype=torch.double)
+        else:
+            device_data[k] = t.to(dtype=torch.double)
+
+    def _all_reduce_dict(data: OrderedDict):
+        if len(data) == 0:
+            return data
+        buf = torch.cat([t.view(-1) for t in data.values()]).to(device=device)
+        all_reduce(buf, group=group)
+        split_buf = torch.split(buf.clone(), [t.numel() for t in data.values()])
+        reduced_data = [t.view_as(orig) for t, orig in zip(split_buf, data.values())]
+        return OrderedDict(zip(data.keys(), reduced_data))
+
+    cpu_data = _all_reduce_dict(cpu_data)
+    device_data = _all_reduce_dict(device_data)
+
+    def get_from_stack(key):
+        if key in cpu_data:
+            return cpu_data[key]
+        elif key in device_data:
+            return device_data[key]
+        raise KeyError
+
+    return OrderedDict([(key, get_from_stack(key)) for key in data_keys])
+
+
+def broadcast_tensors(
+    tensors: Optional[List[torch.Tensor]],
+    src_rank: int,
+    group: object,
+    dist_device: Optional[torch.device] = None,
+) -> List[torch.Tensor]:
+    """
+    Broadcasts a list of tensors without other (non-src) ranks needing to know
+    the dtypes/shapes of the tensors.
+    """
+    if dist_device is None:
+        if torch.distributed.get_backend(group) == "nccl":
+            dist_device = torch.device("cuda")
+        else:
+            dist_device = torch.device("cpu")
+
+    # share metadata first to simplify transfer
+    is_src_rank = get_rank(group) == src_rank
+    if is_src_rank:
+        metadata = [
+            {"size": t.size(), "dtype": t.dtype, "device": t.device} for t in tensors
+        ]
+        metadata = _broadcast_object_slow(metadata, src_rank, group, dist_device)
+    else:
+        metadata = _broadcast_object_slow(None, src_rank, group, dist_device)
+
+    out_tensors = []
+    for i, meta in enumerate(metadata):
+        if is_src_rank:
+            tensor = tensors[i]
+            broadcast(tensors[i].to(dist_device), src=src_rank, group=group)
+        else:
+            tensor = torch.zeros(
+                [meta["size"].numel()], dtype=meta["dtype"], device=dist_device
+            )
+            broadcast(tensor, src=src_rank, group=group)
+        tensor = tensor.view(meta["size"]).to(meta["device"])
+        out_tensors.append(tensor)
+    return out_tensors
+
+
+def broadcast_object(
+    obj: Any,
+    src_rank: int,
+    group: object,
+    dist_device: Optional[torch.device] = None,
+) -> Any:
+    """Broadcast an arbitrary Python object to other workers."""
+    if dist_device is None:
+        if torch.distributed.get_backend(group) == "nccl":
+            dist_device = torch.device("cuda")
+        else:
+            dist_device = torch.device("cpu")
+
+    if get_rank(group) == src_rank:
+        # split the tensors from the non-tensors so we can broadcast them
+        # directly, avoiding unnecessary serialization/deserialization
+        tensors = []
+        obj = _split_tensors_from_obj(obj, tensors)
+        obj = _broadcast_object_slow(obj, src_rank, group, dist_device)
+        tensors = broadcast_tensors(tensors, src_rank, group, dist_device)
+    else:
+        obj = _broadcast_object_slow(None, src_rank, group, dist_device)
+        tensors = broadcast_tensors(None, src_rank, group, dist_device)
+    return _put_tensors_in_obj(obj, tensors)
+
+
+def _broadcast_object_slow(
+    obj: Any,
+    src_rank: int,
+    group: object,
+    dist_device: torch.device,
+) -> Any:
+    if get_rank(group) == src_rank:
+        # Emit data
+        buffer = io.BytesIO()
+        torch.save(obj, buffer)
+        buffer = torch.ByteTensor(buffer.getbuffer()).to(dist_device)
+        length = torch.LongTensor([len(buffer)]).to(dist_device)
+        broadcast(length, src=src_rank, group=group)
+        broadcast(buffer, src=src_rank, group=group)
+    else:
+        # Fetch from the source
+        length = torch.LongTensor([0]).to(dist_device)
+        broadcast(length, src=src_rank, group=group)
+        buffer = torch.ByteTensor(int(length.item())).to(dist_device)
+        broadcast(buffer, src=src_rank, group=group)
+        buffer = io.BytesIO(buffer.cpu().numpy())
+        obj = torch.load(buffer, map_location="cpu")
+    return obj
+
+
+@dataclass(frozen=True)
+class _TensorPlaceholder:
+    index: int
+
+
+def _split_tensors_from_obj(obj: Any, tensors: List[torch.Tensor]) -> Any:
+    if torch.is_tensor(obj):
+        placeholder = _TensorPlaceholder(index=len(tensors))
+        tensors.append(obj)
+        return placeholder
+    elif isinstance(obj, dict):
+        return {k: _split_tensors_from_obj(v, tensors) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [_split_tensors_from_obj(v, tensors) for v in obj]
+    elif isinstance(obj, tuple):
+        return tuple(_split_tensors_from_obj(v, tensors) for v in obj)
+    elif isinstance(obj, set):
+        return {_split_tensors_from_obj(v, tensors) for v in obj}
+    else:
+        return obj
+
+
+def _put_tensors_in_obj(obj: Any, tensors: List[torch.Tensor]) -> Any:
+    if isinstance(obj, _TensorPlaceholder):
+        return tensors[obj.index]
+    elif isinstance(obj, dict):
+        return {k: _put_tensors_in_obj(v, tensors) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [_put_tensors_in_obj(v, tensors) for v in obj]
+    elif isinstance(obj, tuple):
+        return tuple(_put_tensors_in_obj(v, tensors) for v in obj)
+    elif isinstance(obj, set):
+        return {_put_tensors_in_obj(v, tensors) for v in obj}
+    else:
+        return obj
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
new file mode 100644
index 0000000000..3f27549099
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import typing as tp
+
+
+def _safe_readline(fd) -> str:
+    pos = fd.tell()
+    while True:
+        try:
+            return fd.readline()
+        except UnicodeDecodeError:  # presumably positioned inside a character
+            pos -= 1  # step back one position and retry the read
+            fd.seek(pos)  # search where this character begins
+
+
+def find_offsets(filename: str, num_chunks: int) -> tp.List[int]:
+    """
+    given a file and a number of chunks, find the offsets in the file
+    to be able to chunk around full lines.
+    """
+    with open(filename, "r", encoding="utf-8") as f:
+        size = os.fstat(f.fileno()).st_size
+        chunk_size = size // num_chunks
+        offsets = [0 for _ in range(num_chunks + 1)]
+        for i in range(1, num_chunks):
+            f.seek(chunk_size * i)  # may land in the middle of a line
+            _safe_readline(f)  # consume the rest of that line
+            offsets[i] = f.tell()  # chunk i starts at the next line
+        offsets[-1] = size
+        return offsets
+
+
+class ChunkLineIterator:
+    """
+    Iterator to properly iterate over lines of a file chunk.
+    """
+
+    def __init__(self, fd, start_offset: int, end_offset: int):
+        self._fd = fd
+        self._start_offset = start_offset
+        self._end_offset = end_offset
+
+    def __iter__(self) -> tp.Iterable[str]:
+        self._fd.seek(self._start_offset)
+        # next(f) breaks f.tell(), hence readline() must be used
+        line = _safe_readline(self._fd)
+        while line:
+            pos = self._fd.tell()
+            # f.tell() does not always give the byte position in the file
+            # sometimes it skips to a very large number
+            # it is unlikely that through a normal read we go from
+            # end bytes to end + 2**32 bytes (4 GB) and this makes it unlikely
+            # that the procedure breaks by the nondeterministic behavior of
+            # f.tell()
+            if (
+                self._end_offset > 0
+                and pos > self._end_offset
+                and pos < self._end_offset + 2**32
+            ):
+                break
+            yield line
+            line = self._fd.readline()
+
+
+class Chunker:
+    """
+    contextmanager to read a chunk of a file line by line.
+    """
+
+    def __init__(self, path: str, start_offset: int, end_offset: int):
+        self.path = path
+        self.start_offset = start_offset
+        self.end_offset = end_offset
+
+    def __enter__(self) -> ChunkLineIterator:
+        self.fd = open(self.path, "r", encoding="utf-8")  # closed in __exit__
+        return ChunkLineIterator(self.fd, self.start_offset, self.end_offset)
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.fd.close()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
new file mode 100644
index 0000000000..8eca70a066
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import shutil
+from typing import List, Optional
+
+
+logger = logging.getLogger(__file__)
+
+
+try:
+    from iopath.common.file_io import g_pathmgr as IOPathManager
+
+    try:
+        # [FB only - for now] AWS PathHandler for PathManager
+        from .fb_pathhandlers import S3PathHandler
+
+        IOPathManager.register_handler(S3PathHandler())
+    except KeyError:
+        logging.warning("S3PathHandler already registered.")
+    except ImportError:
+        logging.debug(
+            "S3PathHandler couldn't be imported. Either missing fb-only files, or boto3 module."
+        )
+
+except ImportError:
+    IOPathManager = None
+
+
+class PathManager:
+    """
+    Wrapper for insulating OSS I/O (using Python builtin operations) from
+    iopath's PathManager abstraction (for transparently handling various
+    internal backends).
+    """
+
+    @staticmethod
+    def open(
+        path: str,
+        mode: str = "r",
+        buffering: int = -1,
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        newline: Optional[str] = None,
+    ):
+        if IOPathManager:
+            return IOPathManager.open(
+                path=path,
+                mode=mode,
+                buffering=buffering,
+                encoding=encoding,
+                errors=errors,
+                newline=newline,
+            )
+        return open(
+            path,
+            mode=mode,
+            buffering=buffering,
+            encoding=encoding,
+            errors=errors,
+            newline=newline,
+        )
+
+    @staticmethod
+    def copy(src_path: str, dst_path: str, overwrite: bool = False) -> bool:
+        if IOPathManager:
+            return IOPathManager.copy(
+                src_path=src_path, dst_path=dst_path, overwrite=overwrite
+            )
+        return shutil.copyfile(src_path, dst_path)  # NOTE(review): ignores overwrite
+
+    @staticmethod
+    def get_local_path(path: str, **kwargs) -> str:
+        if IOPathManager:
+            return IOPathManager.get_local_path(path, **kwargs)
+        return path
+
+    @staticmethod
+    def exists(path: str) -> bool:
+        if IOPathManager:
+            return IOPathManager.exists(path)
+        return os.path.exists(path)
+
+    @staticmethod
+    def isfile(path: str) -> bool:
+        if IOPathManager:
+            return IOPathManager.isfile(path)
+        return os.path.isfile(path)
+
+    @staticmethod
+    def ls(path: str) -> List[str]:
+        if IOPathManager:
+            return IOPathManager.ls(path)
+        return os.listdir(path)
+
+    @staticmethod
+    def mkdirs(path: str) -> None:
+        if IOPathManager:
+            return IOPathManager.mkdirs(path)
+        os.makedirs(path, exist_ok=True)
+
+    @staticmethod
+    def rm(path: str) -> None:
+        if IOPathManager:
+            return IOPathManager.rm(path)
+        os.remove(path)
+
+    @staticmethod
+    def chmod(path: str, mode: int) -> None:
+        if not PathManager.path_requires_pathmanager(path):
+            os.chmod(path, mode)
+
+    @staticmethod
+    def register_handler(handler) -> None:
+        if IOPathManager:
+            return IOPathManager.register_handler(handler=handler)
+
+    @staticmethod
+    def copy_from_local(
+        local_path: str, dst_path: str, overwrite: bool = False, **kwargs
+    ) -> None:
+        if IOPathManager:
+            return IOPathManager.copy_from_local(
+                local_path=local_path, dst_path=dst_path, overwrite=overwrite, **kwargs
+            )
+        return shutil.copyfile(local_path, dst_path)
+
+    @staticmethod
+    def path_requires_pathmanager(path: str) -> bool:
+        """Do we require PathManager to access given path?"""
+        if IOPathManager:
+            for p in IOPathManager._path_handlers.keys():
+                if path.startswith(p):
+                    return True
+        return False
+
+    @staticmethod
+    def supports_rename(path: str) -> bool:
+        # PathManager doesn't yet support renames
+        return not PathManager.path_requires_pathmanager(path)
+
+    @staticmethod
+    def rename(src: str, dst: str):
+        os.rename(src, dst)
+
+    """
+    ioPath async PathManager methods:
+    """
+
+    @staticmethod
+    def opena(
+        path: str,
+        mode: str = "r",
+        buffering: int = -1,
+        encoding: Optional[str] = None,
+        errors: Optional[str] = None,
+        newline: Optional[str] = None,
+    ):
+        """
+        Return file descriptor with asynchronous write operations.
+        """
+        global IOPathManager
+        if not IOPathManager:  # iopath was not importable at module load time
+            logging.info("ioPath is initializing PathManager.")
+            try:
+                from iopath.common.file_io import PathManager
+                IOPathManager = PathManager()
+            except Exception:
+                logging.exception("Failed to initialize ioPath PathManager object.")
+                raise  # otherwise the call below fails with AttributeError on None
+        return IOPathManager.opena(
+            path=path,
+            mode=mode,
+            buffering=buffering,
+            encoding=encoding,
+            errors=errors,
+            newline=newline,
+        )
+
+    @staticmethod
+    def async_close() -> bool:
+        """
+        Wait for files to be written and clean up asynchronous PathManager.
+        NOTE: `PathManager.async_close()` must be called at the end of any
+        script that uses `PathManager.opena(...)`.
+        """
+        global IOPathManager
+        if IOPathManager:
+            return IOPathManager.async_close()
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
new file mode 100644
index 0000000000..b99da2e8cd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
@@ -0,0 +1,370 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from `AllenNLP <https://github.com/allenai/allennlp>`_
+and `huggingface <https://github.com/huggingface>`_.
+"""
+
+import fnmatch
+import json
+import logging
+import os
+import shutil
+import tarfile
+import tempfile
+from functools import partial, wraps
+from hashlib import sha256
+from io import open
+
+
+try:
+    from torch.hub import _get_torch_home
+
+    torch_cache_home = _get_torch_home()
+except ImportError:
+    torch_cache_home = os.path.expanduser(
+        os.getenv(
+            "TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")
+        )
+    )
+default_cache_path = os.path.join(torch_cache_home, "pytorch_fairseq")
+
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+try:
+    from pathlib import Path
+
+    PYTORCH_FAIRSEQ_CACHE = Path(os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path))
+except (AttributeError, ImportError):
+    PYTORCH_FAIRSEQ_CACHE = os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path)
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+def load_archive_file(archive_file):
+    """Resolve `archive_file` through the cache and unpack it in place if needed.
+    Returns the resolved path, or None when `cached_path` raises EnvironmentError.
+    """
+    try:
+        resolved_archive_file = cached_path(archive_file, cache_dir=None)
+    except EnvironmentError:
+        logger.info(
+            "Archive name '{}' was not found in archive name list. "
+            "We assumed '{}' was a path or URL but couldn't find any file "
+            "associated to this path or URL.".format(archive_file, archive_file)
+        )
+        return None
+
+    if resolved_archive_file == archive_file:
+        logger.info("loading archive file {}".format(archive_file))
+    else:
+        logger.info(
+            "loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file
+            )
+        )
+
+    # Non-directories are treated as tar archives and replaced by their top dir.
+    if not os.path.isdir(resolved_archive_file):
+        tempdir = tempfile.mkdtemp()
+        logger.info(
+            "extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir
+            )
+        )
+        try:
+            ext = os.path.splitext(archive_file)[1][1:]  # suffix picks the tar mode
+            with tarfile.open(resolved_archive_file, "r:" + ext) as archive:
+                top_dir = os.path.commonprefix(archive.getnames())
+                archive.extractall(tempdir)  # NOTE: only safe for trusted archives
+            os.remove(resolved_archive_file)
+            shutil.move(os.path.join(tempdir, top_dir), resolved_archive_file)
+        finally:  # always drop the scratch dir, even if extraction or move failed
+            shutil.rmtree(tempdir, ignore_errors=True)
+
+    return resolved_archive_file
+
+
+def url_to_filename(url, etag=None):
+    """
+    Map `url` to a deterministic cache filename: the SHA-256 hex digest of
+    its UTF-8 bytes. When `etag` is truthy, the digest of the etag is
+    appended after a period.
+    """
+    def _digest(text):
+        return sha256(text.encode("utf-8")).hexdigest()
+
+    parts = [_digest(url)]
+    if etag:
+        parts.append(_digest(etag))
+
+    # Joined as "<url-hash>" or "<url-hash>.<etag-hash>".
+    name = ".".join(parts)
+    return name
+
+
+def filename_to_url(filename, cache_dir=None):
+    """
+    Look up the url and etag (possibly ``None``) recorded for cached `filename`.
+    Raise ``EnvironmentError`` if `filename` or its ``.json`` metadata is missing.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    cache_path = os.path.join(cache_dir, filename)
+    meta_path = cache_path + ".json"
+
+    # Check the payload first, then its sidecar metadata, in that order.
+    for required in (cache_path, meta_path):
+        if not os.path.exists(required):
+            raise EnvironmentError("file {} not found".format(required))
+
+    with open(meta_path, encoding="utf-8") as meta_file:
+        metadata = json.load(meta_file)
+
+    # Both keys must be present; a missing one surfaces as KeyError.
+    url, etag = metadata["url"], metadata["etag"]
+    return url, etag
+
+
+def cached_path_from_pm(url_or_filename):
+    """
+    Best-effort lookup of a local copy of `url_or_filename` via PathManager.
+    Returns the local path on success, or None if anything goes wrong.
+    """
+    try:
+        from fairseq.file_io import PathManager
+
+        return PathManager.get_local_path(url_or_filename)
+    except Exception:
+        # Import errors and PathManager failures are both reported as None.
+        return None
+
+
+def cached_path(url_or_filename, cache_dir=None):
+    """
+    Resolve `url_or_filename` to a local path. http/https/s3 URLs are fetched
+    through the cache (downloading if needed); an existing local path is
+    returned as-is; a scheme-less path that does not exist raises
+    EnvironmentError; anything else is tried via PathManager, else ValueError.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(url_or_filename, Path):
+        url_or_filename = str(url_or_filename)
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    scheme = urlparse(url_or_filename).scheme
+
+    # Remote URL: serve it from the cache, downloading on a miss.
+    if scheme in ("http", "https", "s3"):
+        return get_from_cache(url_or_filename, cache_dir)
+    # Existing local file or directory.
+    if os.path.exists(url_or_filename):
+        return url_or_filename
+    # Looks like a local path, but nothing is there.
+    if scheme == "":
+        raise EnvironmentError("file {} not found".format(url_or_filename))
+
+    # Unknown scheme: give PathManager a chance before giving up.
+    local_copy = cached_path_from_pm(url_or_filename)
+    if local_copy:
+        return local_copy
+    raise ValueError(
+        "unable to parse {} as a URL or as a local path".format(url_or_filename)
+    )
+
+
+def split_s3_path(url):
+    """Split a full s3 path into the bucket name and path."""
+    parts = urlparse(url)
+    bucket_name, s3_path = parts.netloc, parts.path
+    if not (bucket_name and s3_path):
+        raise ValueError("bad s3 path {}".format(url))
+    # Drop a single leading '/' so the key is relative to the bucket.
+    if s3_path[:1] == "/":
+        s3_path = s3_path[1:]
+    # e.g. "s3://bucket/a/b" -> ("bucket", "a/b")
+    return bucket_name, s3_path
+
+
+def s3_request(func):
+    """
+    Decorator for S3 helpers: turns a botocore ``ClientError`` whose error code
+    is 404 into ``EnvironmentError("file <url> not found")``; others propagate.
+    """
+
+    @wraps(func)
+    def wrapper(url, *args, **kwargs):
+        from botocore.exceptions import ClientError
+
+        try:
+            return func(url, *args, **kwargs)
+        except ClientError as exc:
+            # Codes may be non-numeric ("AccessDenied"); int() would raise ValueError.
+            if str(exc.response["Error"]["Code"]) == "404":
+                raise EnvironmentError("file {} not found".format(url)) from exc
+            raise
+
+    return wrapper
+
+
+@s3_request
+def s3_etag(url):
+    """Return the ETag of the S3 object addressed by `url`."""
+    import boto3
+
+    resource = boto3.resource("s3")
+    # split_s3_path yields (bucket, key), the two arguments Object() is given.
+    bucket_name, key = split_s3_path(url)
+    return resource.Object(bucket_name, key).e_tag
+
+
+@s3_request
+def s3_get(url, temp_file):
+    """Download the S3 object at `url` into the open file object `temp_file`."""
+    import boto3
+
+    resource = boto3.resource("s3")
+    bucket_name, key = split_s3_path(url)
+    resource.Bucket(bucket_name).download_fileobj(key, temp_file)
+
+
+def request_wrap_timeout(func, url):
+    """Call ``func(timeout=t)`` for t in 10, 20, 40, 60, 60 s until no Timeout occurs.
+    `url` only labels messages; RuntimeError is raised once every attempt timed out.
+    """
+    import requests
+
+    timeouts = [10, 20, 40, 60, 60]
+    for attempt, timeout in enumerate(timeouts):
+        try:
+            return func(timeout=timeout)
+        except requests.exceptions.Timeout as e:
+            logger.warning(
+                "Request for %s timed-out (attempt %d). Retrying with a timeout of %d secs",
+                url, attempt, timeout, exc_info=e,
+            )
+    raise RuntimeError(f"Unable to fetch file {url}")
+
+
+def http_get(url, temp_file):
+    """Stream `url` into the open file object `temp_file`, showing a byte progress bar."""
+    import requests
+    from tqdm import tqdm
+
+    req = request_wrap_timeout(partial(requests.get, url, stream=True), url)
+    content_length = req.headers.get("Content-Length")
+    total = int(content_length) if content_length is not None else None
+    with tqdm(unit="B", total=total) as progress:  # closed even if the download fails
+        for chunk in req.iter_content(chunk_size=1024):
+            if chunk:  # filter out keep-alive new chunks
+                progress.update(len(chunk))
+                temp_file.write(chunk)
+
+
+def get_from_cache(url, cache_dir=None):
+    """
+    Return the local cache path for `url`, downloading it first on a cache miss.
+    A ``<path>.json`` sidecar recording the url and etag is written on download.
+    """
+    if cache_dir is None:
+        cache_dir = PYTORCH_FAIRSEQ_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    # exist_ok: several processes may try to create the cache dir at the same time.
+    os.makedirs(cache_dir, exist_ok=True)
+
+    # Get eTag to add to filename, if it exists.
+    if url.startswith("s3://"):
+        etag = s3_etag(url)
+    else:
+        try:
+            import requests
+
+            response = request_wrap_timeout(
+                partial(requests.head, url, allow_redirects=True), url
+            )
+            if response.status_code != 200:
+                etag = None
+            else:
+                etag = response.headers.get("ETag")
+        except RuntimeError:
+            etag = None
+
+    filename = url_to_filename(url, etag)
+
+    # get cache path to put the file
+    cache_path = os.path.join(cache_dir, filename)
+
+    # If we don't have a connection (etag is None) and can't identify the file
+    # try to get the last downloaded one
+    if not os.path.exists(cache_path) and etag is None:
+        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + ".*")
+        matching_files = [s for s in matching_files if not s.endswith(".json")]
+        if matching_files:
+            cache_path = os.path.join(cache_dir, matching_files[-1])
+
+    if not os.path.exists(cache_path):
+        # Download to temporary file, then copy to cache dir once finished.
+        # Otherwise you get corrupt cache entries if the download gets interrupted.
+        with tempfile.NamedTemporaryFile() as temp_file:
+            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+
+            # GET file object
+            if url.startswith("s3://"):
+                s3_get(url, temp_file)
+            else:
+                http_get(url, temp_file)
+
+            # we are copying the file before closing it, so flush to avoid truncation
+            temp_file.flush()
+            # shutil.copyfileobj() starts at the current position, so go to the start
+            temp_file.seek(0)
+
+            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
+            with open(cache_path, "wb") as cache_file:
+                shutil.copyfileobj(temp_file, cache_file)
+
+            logger.info("creating metadata file for %s", cache_path)
+            meta = {"url": url, "etag": etag}
+            meta_path = cache_path + ".json"
+            with open(meta_path, "w", encoding="utf-8") as meta_file:
+                output_string = json.dumps(meta)
+                meta_file.write(output_string)
+
+            logger.info("removing temp file %s", temp_file.name)
+
+    return cache_path
+
+
+def read_set_from_file(filename):
+    """
+    Read a UTF-8 text file with one item per line and return the distinct
+    lines as a set, with trailing whitespace (including the newline) removed.
+    """
+    with open(filename, "r", encoding="utf-8") as handle:
+        # A set comprehension de-duplicates while streaming the file.
+        unique_lines = {raw.rstrip() for raw in handle}
+    # Order is not preserved; blank lines collapse to a single "" entry.
+    return unique_lines
+
+
+def get_file_extension(path, dot=True, lower=True):
+    """Return `path`'s extension; drop the leading dot if not `dot`, lowercase if `lower`."""
+    suffix = os.path.splitext(path)[1][int(not dot):]
+    return suffix.lower() if lower else suffix
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
new file mode 100644
index 0000000000..b0c2da15bf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import copy
+import logging
+import os
+from typing import Any, Dict, Iterator, List
+
+import torch
+from omegaconf import open_dict
+from torch import nn
+
+from fairseq import utils
+from fairseq.data import encoders
+
+logger = logging.getLogger(__name__)
+
+
+def from_pretrained(
+    model_name_or_path,
+    checkpoint_file="model.pt",
+    data_name_or_path=".",
+    archive_map=None,
+    **kwargs
+):
+    """Load checkpoint(s) from a path/URL/archive_map key; return {"args", "task", "models"}."""
+    from fairseq import checkpoint_utils, file_utils
+    if archive_map is not None:
+        if model_name_or_path in archive_map:
+            model_name_or_path = archive_map[model_name_or_path]  # alias -> path or dict
+        if data_name_or_path is not None and data_name_or_path in archive_map:
+            data_name_or_path = archive_map[data_name_or_path]
+
+        # allow archive_map to set default arg_overrides (e.g., tokenizer, bpe)
+        # for each model
+        if isinstance(model_name_or_path, dict):
+            for k, v in model_name_or_path.items():
+                if k == "checkpoint_file":
+                    checkpoint_file = v
+                elif (
+                    k != "path"
+                    # only set kwargs that don't already have overrides
+                    and k not in kwargs
+                ):
+                    kwargs[k] = v
+            model_name_or_path = model_name_or_path["path"]
+
+    # NOTE(review): load_archive_file returns None on EnvironmentError; not handled here.
+    model_path = file_utils.load_archive_file(model_name_or_path)
+
+    # convenience hack for loading data and BPE codes from model archive
+    if data_name_or_path.startswith("."):  # NOTE(review): fails if None — confirm callers
+        kwargs["data"] = os.path.abspath(os.path.join(model_path, data_name_or_path))
+    else:
+        kwargs["data"] = file_utils.load_archive_file(data_name_or_path)
+    for file, arg in {  # well-known file name in the model dir -> override key
+        "code": "bpe_codes",
+        "bpecodes": "bpe_codes",
+        "sentencepiece.bpe.model": "sentencepiece_model",
+        "merges.txt": "bpe_merges",
+        "vocab.json": "bpe_vocab",
+    }.items():
+        path = os.path.join(model_path, file)
+        if os.path.exists(path):
+            kwargs[arg] = path
+
+    if "user_dir" in kwargs:
+        utils.import_user_module(argparse.Namespace(user_dir=kwargs["user_dir"]))
+
+    model_path = [  # checkpoint_file may name several files joined by os.pathsep
+        os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)
+    ]
+
+    if "is_vocoder" in kwargs:  # key presence alone skips model loading
+        args = {"data": kwargs["data"], "model_path": model_path}
+        task = None
+        models = None
+    else:
+        models, args, task = checkpoint_utils.load_model_ensemble_and_task(
+            model_path,
+            arg_overrides=kwargs,
+        )
+    if "generation_args" in kwargs and kwargs["generation_args"]:
+        for key in kwargs["generation_args"]:
+            # NOTE(review): the is_vocoder dict above has no "generation" key — confirm.
+            setattr(args["generation"], key, kwargs["generation_args"][key])
+
+    return {
+        "args": args,
+        "task": task,
+        "models": models,
+    }
+
+
+class GeneratorHubInterface(nn.Module):
+    """
+    PyTorch Hub interface for generating sequences from a pre-trained
+    translation or language model.
+    """
+
+    def __init__(self, cfg, task, models):
+        super().__init__()
+        self.cfg = cfg
+        self.task = task
+        self.models = nn.ModuleList(models)
+        self.src_dict = task.source_dictionary
+        self.tgt_dict = task.target_dictionary
+
+        # optimize model for generation
+        for model in self.models:
+            model.prepare_for_inference_(cfg)
+
+        # Load alignment dictionary for unknown word replacement
+        # (None if no unknown word replacement, empty if no path to align dictionary)
+        self.align_dict = utils.load_align_dict(cfg.generation.replace_unk)
+
+        self.tokenizer = encoders.build_tokenizer(cfg.tokenizer)
+        self.bpe = encoders.build_bpe(cfg.bpe)
+
+        self.max_positions = utils.resolve_max_positions(
+            self.task.max_positions(), *[model.max_positions() for model in models]
+        )
+
+        # this is useful for determining the device
+        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))
+
+    @property
+    def device(self):
+        return self._float_tensor.device
+
+    def translate(
+        self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs
+    ) -> List[str]:
+        return self.sample(sentences, beam, verbose, **kwargs)
+
+    def sample(
+        self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs
+    ) -> List[str]:
+        """Encode, generate and decode the top hypothesis; a lone str in gives a str out."""
+        if isinstance(sentences, str):
+            return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
+        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
+        batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
+        return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos]
+
+    def score(
+        self, sentences: List[str], replace_newline_with_eos: bool = False, **kwargs
+    ):
+        """Return the first hypothesis per sentence, generated with score_reference=True."""
+        if isinstance(sentences, str):
+            return self.score(
+                [sentences], replace_newline_with_eos=replace_newline_with_eos, **kwargs
+            )[0]
+
+        def encode(sentence):
+            if replace_newline_with_eos:
+                return torch.cat([self.encode(line) for line in sentence.splitlines()])
+            else:
+                return self.encode(sentence)
+
+        # NOTE: this doesn't support translation tasks currently
+        tokenized_sentences = [encode(sentence) for sentence in sentences]
+        return [
+            hypos[0]
+            for hypos in self.generate(
+                tokenized_sentences, score_reference=True, **kwargs
+            )
+        ]
+
+    def generate(
+        self,
+        tokenized_sentences: List[torch.LongTensor],
+        beam: int = 5,
+        verbose: bool = False,
+        skip_invalid_size_inputs=False,
+        inference_step_args=None,
+        prefix_allowed_tokens_fn=None,
+        **kwargs
+    ) -> List[List[Dict[str, torch.Tensor]]]:
+        """Generate hypotheses for each tokenized sentence, returned in input order.
+        A single 1-D tensor is handled as one sentence and yields just its hypotheses.
+        """
+        if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1:
+            # Forward every option: the named ones below are not part of **kwargs.
+            return self.generate(
+                tokenized_sentences.unsqueeze(0),
+                beam=beam,
+                verbose=verbose,
+                skip_invalid_size_inputs=skip_invalid_size_inputs,
+                inference_step_args=inference_step_args,
+                prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+                **kwargs
+            )[0]
+
+        # build generator using current args as well as any kwargs
+        gen_args = copy.deepcopy(self.cfg.generation)
+        with open_dict(gen_args):
+            gen_args.beam = beam
+            for k, v in kwargs.items():
+                setattr(gen_args, k, v)
+        generator = self.task.build_generator(
+            self.models,
+            gen_args,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+        )
+
+        inference_step_args = inference_step_args or {}
+        results = []
+        for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
+            batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
+            translations = self.task.inference_step(
+                generator, self.models, batch, **inference_step_args
+            )
+            for sample_id, hypos in zip(batch["id"].tolist(), translations):
+                results.append((sample_id, hypos))
+
+        # sort output to match input order
+        outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])]
+
+        if verbose:
+
+            def getarg(name, default):
+                return getattr(gen_args, name, getattr(self.cfg, name, default))
+
+            for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs):
+                src_str_with_unk = self.string(source_tokens)
+                logger.info("S\t{}".format(src_str_with_unk))
+                for hypo in target_hypotheses:
+                    hypo_str = self.decode(hypo["tokens"])
+                    logger.info("H\t{}\t{}".format(hypo["score"], hypo_str))
+                    pos_scores = hypo["positional_scores"].tolist()
+                    logger.info(
+                        "P\t{}".format(" ".join("{:.4f}".format(x) for x in pos_scores))
+                    )
+                    if hypo["alignment"] is not None and getarg(
+                        "print_alignment", False
+                    ):
+                        pairs = ("{}-{}".format(s, t) for s, t in hypo["alignment"])
+                        logger.info("A\t{}".format(" ".join(pairs)))
+        return outputs
+
+    def encode(self, sentence: str) -> torch.LongTensor:
+        """Tokenize, apply BPE, then binarize with the source dictionary."""
+        sentence = self.tokenize(sentence)
+        sentence = self.apply_bpe(sentence)
+        return self.binarize(sentence)
+
+    def decode(self, tokens: torch.LongTensor) -> str:
+        sentence = self.string(tokens)  # inverse of encode(): string, un-BPE, detokenize
+        sentence = self.remove_bpe(sentence)
+        return self.detokenize(sentence)
+
+    def tokenize(self, sentence: str) -> str:
+        if self.tokenizer is not None:  # no tokenizer configured -> pass through
+            sentence = self.tokenizer.encode(sentence)
+        return sentence
+
+    def detokenize(self, sentence: str) -> str:
+        if self.tokenizer is not None:
+            sentence = self.tokenizer.decode(sentence)
+        return sentence
+
+    def apply_bpe(self, sentence: str) -> str:
+        if self.bpe is not None:  # no BPE configured -> pass through
+            sentence = self.bpe.encode(sentence)
+        return sentence
+
+    def remove_bpe(self, sentence: str) -> str:
+        if self.bpe is not None:
+            sentence = self.bpe.decode(sentence)
+        return sentence
+
+    def binarize(self, sentence: str) -> torch.LongTensor:
+        return self.src_dict.encode_line(sentence, add_if_not_exist=False).long()
+
+    def string(self, tokens: torch.LongTensor) -> str:
+        return self.tgt_dict.string(tokens)
+
+    def _build_batches(
+        self, tokens: List[torch.LongTensor], skip_invalid_size_inputs: bool
+    ) -> Iterator[Dict[str, Any]]:
+        """Return an unshuffled batch iterator over `tokens`, sized from cfg.dataset."""
+        lengths = torch.LongTensor([t.numel() for t in tokens])
+        batch_iterator = self.task.get_batch_iterator(
+            dataset=self.task.build_dataset_for_inference(tokens, lengths),
+            max_tokens=self.cfg.dataset.max_tokens,
+            max_sentences=self.cfg.dataset.batch_size,
+            max_positions=self.max_positions,
+            ignore_invalid_inputs=skip_invalid_size_inputs,
+            disable_iterator_cache=True,
+        ).next_epoch_itr(shuffle=False)
+        return batch_iterator
+
+
+class BPEHubInterface(object):
+    """PyTorch Hub interface for Byte-Pair Encoding (BPE)."""
+
+    def __init__(self, bpe, **kwargs) -> None:
+        super().__init__()
+        args = argparse.Namespace(bpe=bpe, **kwargs)  # settings handed to build_bpe
+        self.bpe = encoders.build_bpe(args)
+        assert self.bpe is not None  # build_bpe must have produced an encoder
+
+    def encode(self, sentence: str) -> str:
+        return self.bpe.encode(sentence)  # delegates to the wrapped BPE object
+
+    def decode(self, sentence: str) -> str:
+        return self.bpe.decode(sentence)  # delegates to the wrapped BPE object
+
+
+class TokenizerHubInterface(object):
+    """PyTorch Hub interface for tokenization."""
+
+    def __init__(self, tokenizer, **kwargs) -> None:
+        super().__init__()
+        args = argparse.Namespace(tokenizer=tokenizer, **kwargs)  # for build_tokenizer
+        self.tokenizer = encoders.build_tokenizer(args)
+        assert self.tokenizer is not None  # build_tokenizer must have produced one
+
+    def encode(self, sentence: str) -> str:
+        return self.tokenizer.encode(sentence)  # delegates to the wrapped tokenizer
+
+    def decode(self, sentence: str) -> str:
+        return self.tokenizer.decode(sentence)  # delegates to the wrapped tokenizer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
new file mode 100644
index 0000000000..b26e6cd01c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import uuid
+from typing import Dict, Optional
+
+from torch import Tensor
+
+
+class FairseqIncrementalState(object):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.init_incremental_state()  # give this instance its own key prefix
+
+    def init_incremental_state(self):
+        self._incremental_state_id = str(uuid.uuid4())  # random per-instance id
+
+    def _get_full_incremental_state_key(self, key: str) -> str:
+        return f"{self._incremental_state_id}.{key}"
+
+    def get_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        key: str,
+    ) -> Optional[Dict[str, Optional[Tensor]]]:
+        """Return the entry stored for this instance under `key`, or None if absent."""
+        full_key = self._get_full_incremental_state_key(key)
+        if incremental_state is not None and full_key in incremental_state:
+            return incremental_state[full_key]
+        return None
+
+    def set_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        key: str,
+        value: Dict[str, Optional[Tensor]],
+    ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+        """Store `value` for this instance under `key`; a None state is returned as-is."""
+        if incremental_state is None:
+            return incremental_state
+        incremental_state[self._get_full_incremental_state_key(key)] = value
+        return incremental_state
+
+
+def with_incremental_state(cls):
+    """Class decorator: make FairseqIncrementalState the first base of `cls` (listed once)."""
+    remaining = [base for base in cls.__bases__ if base != FairseqIncrementalState]
+    cls.__bases__ = (FairseqIncrementalState, *remaining)
+    return cls
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
new file mode 100644
index 0000000000..3d32c6bf4d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
@@ -0,0 +1,359 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import namedtuple
+
+import numpy as np
+import torch
+from fairseq import utils
+
+
+DecoderOut = namedtuple(
+    "IterativeRefinementDecoderOut",
+    ["output_tokens", "output_scores", "attn", "step", "max_step", "history"],
+)
+
+
+class IterativeRefinementGenerator(object):
+    def __init__(
+        self,
+        tgt_dict,
+        models=None,
+        eos_penalty=0.0,
+        max_iter=10,
+        max_ratio=2,
+        beam_size=1,
+        decoding_format=None,
+        retain_dropout=False,
+        adaptive=True,
+        retain_history=False,
+        reranking=False,
+    ):
+        """
+        Generates translations based on iterative refinement.
+
+        Args:
+            tgt_dict: target dictionary
+            eos_penalty: if > 0.0, it penalizes early stopping in decoding
+            max_iter: maximum number of refinement iterations
+            max_ratio: generate sequences of maximum length ax, where x is the source length
+            decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'}
+            retain_dropout: retain dropout during inference (models are not switched to eval mode)
+            adaptive: stop refining a hypothesis early once an iteration leaves it unchanged
+        """
+        self.bos = tgt_dict.bos()
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos()
+        self.vocab_size = len(tgt_dict)
+        self.eos_penalty = eos_penalty
+        self.max_iter = max_iter
+        self.max_ratio = max_ratio
+        self.beam_size = beam_size  # > 1 turns on length-beam decoding in generate()
+        self.reranking = reranking  # if True, generate() uses the last model as a reranker
+        self.decoding_format = decoding_format
+        self.retain_dropout = retain_dropout
+        self.retain_history = retain_history  # keep the outputs of every iteration per hypothesis
+        self.adaptive = adaptive
+        self.models = models  # models handed to generate() by generate_batched_itr()
+
+    def generate_batched_itr(
+        self,
+        data_itr,
+        maxlen_a=None,
+        maxlen_b=None,
+        cuda=False,
+        timer=None,
+        prefix_size=0,
+    ):
+        """Iterate over a batched dataset and yield (id, src, ref, hypos) per sentence.
+
+        Args:
+            maxlen_a/b, cuda: accepted but not used by this generator.
+            timer: optional StopwatchMeter; started before and stopped after each
+                batch, with the batch's ``ntokens`` as the count.
+            prefix_size: if > 0, that many leading target tokens are passed as prefix_tokens.
+        """
+
+        for sample in data_itr:
+            if "net_input" not in sample:
+                continue
+            if timer is not None:
+                timer.start()
+            with torch.no_grad():
+                hypos = self.generate(
+                    self.models,
+                    sample,
+                    prefix_tokens=sample["target"][:, :prefix_size]
+                    if prefix_size > 0
+                    else None,
+                )
+            if timer is not None:
+                timer.stop(sample["ntokens"])
+            for i, sample_id in enumerate(sample["id"]):  # named so the builtin ``id`` stays usable
+                # remove padding
+                src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad)
+                ref = utils.strip_pad(sample["target"][i, :], self.pad)
+                yield sample_id, src, ref, hypos[i]
+
+    @torch.no_grad()
+    def generate(self, models, sample, prefix_tokens=None, constraints=None):
+        """Run iterative-refinement decoding on one batch; returns, per sentence, a one-item
+        list with its hypothesis dict. *prefix_tokens* is accepted but not used here."""
+        if constraints is not None:
+            raise NotImplementedError(
+                "Constrained decoding with the IterativeRefinementGenerator is not supported"
+            )
+
+        # TODO: iterative refinement generator does not support ensemble for now.
+        if not self.retain_dropout:
+            for model in models:
+                model.eval()
+
+        model, reranker = models[0], None
+        if self.reranking:
+            assert len(models) > 1, "Assuming the last checkpoint is the reranker"
+            assert self.beam_size > 1, "Reranking requires multiple translation for each example"
+
+            reranker = models[-1]
+            models = models[:-1]
+
+        if len(models) > 1 and hasattr(model, "enable_ensemble"):
+            assert model.allow_ensemble, "{} does not support ensembling".format(
+                model.__class__.__name__
+            )
+            model.enable_ensemble(models)
+
+        # TODO: better encoder inputs?
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lengths = sample["net_input"]["src_lengths"]
+        bsz, src_len = src_tokens.size()
+
+        # initialize
+        encoder_out = model.forward_encoder([src_tokens, src_lengths])
+        prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens)
+
+        if self.beam_size > 1:
+            assert (
+                model.allow_length_beam
+            ), "{} does not support decoding with length beam.".format(
+                model.__class__.__name__
+            )
+
+            # regenerate data based on length-beam
+            length_beam_order = (
+                utils.new_arange(src_tokens, self.beam_size, bsz).t().reshape(-1)
+            )
+            encoder_out = model.encoder.reorder_encoder_out(
+                encoder_out, length_beam_order
+            )
+            prev_decoder_out = model.regenerate_length_beam(
+                prev_decoder_out, self.beam_size
+            )
+            bsz = bsz * self.beam_size
+
+        sent_idxs = torch.arange(bsz)  # original batch position of every still-active row
+        prev_output_tokens = prev_decoder_out.output_tokens.clone()
+
+        if self.retain_history:
+            prev_decoder_out = prev_decoder_out._replace(history=[prev_output_tokens])
+
+        finalized = [[] for _ in range(bsz)]
+
+        def is_a_loop(x, y, s, a):  # pad x/y to equal length; rows where x == y did not change
+            b, l_x, l_y = x.size(0), x.size(1), y.size(1)
+            if l_x > l_y:
+                y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1)
+                s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1)
+                if a is not None:
+                    a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1)
+            elif l_x < l_y:
+                x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1)
+            return (x == y).all(1), y, s, a
+
+        def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn):
+            cutoff = prev_out_token.ne(self.pad)  # mask of non-pad positions
+            tokens = prev_out_token[cutoff]
+            if prev_out_score is None:
+                scores, score = None, None
+            else:
+                scores = prev_out_score[cutoff]
+                score = scores.mean()
+
+            if prev_out_attn is None:
+                hypo_attn, alignment = None, None
+            else:
+                hypo_attn = prev_out_attn[cutoff]
+                alignment = hypo_attn.max(dim=1)[1]
+            return {
+                "steps": step,
+                "tokens": tokens,
+                "positional_scores": scores,
+                "score": score,
+                "hypo_attn": hypo_attn,
+                "alignment": alignment,
+            }
+
+        for step in range(self.max_iter + 1):
+
+            decoder_options = {
+                "eos_penalty": self.eos_penalty,
+                "max_ratio": self.max_ratio,
+                "decoding_format": self.decoding_format,
+            }
+            prev_decoder_out = prev_decoder_out._replace(
+                step=step,
+                max_step=self.max_iter + 1,
+            )
+
+            decoder_out = model.forward_decoder(
+                prev_decoder_out, encoder_out, **decoder_options
+            )
+
+            if self.adaptive:
+                # terminate if there is a loop
+                terminated, out_tokens, out_scores, out_attn = is_a_loop(
+                    prev_output_tokens,
+                    decoder_out.output_tokens,
+                    decoder_out.output_scores,
+                    decoder_out.attn,
+                )
+                decoder_out = decoder_out._replace(
+                    output_tokens=out_tokens,
+                    output_scores=out_scores,
+                    attn=out_attn,
+                )
+
+            else:
+                terminated = decoder_out.output_tokens.new_zeros(
+                    decoder_out.output_tokens.size(0)
+                ).bool()
+
+            if step == self.max_iter:  # reach last iteration, terminate
+                terminated.fill_(1)
+
+            # collect finalized sentences
+            finalized_idxs = sent_idxs[terminated.to(sent_idxs.device)]
+            finalized_tokens = decoder_out.output_tokens[terminated]
+            finalized_scores = decoder_out.output_scores[terminated]
+            finalized_attn = (
+                None
+                if (decoder_out.attn is None or decoder_out.attn.size(0) == 0)
+                else decoder_out.attn[terminated]
+            )
+
+            if self.retain_history:
+                finalized_history_tokens = [h[terminated] for h in decoder_out.history]
+
+            for i in range(finalized_idxs.size(0)):
+                finalized[finalized_idxs[i]] = [
+                    finalized_hypos(
+                        step,
+                        finalized_tokens[i],
+                        finalized_scores[i],
+                        None if finalized_attn is None else finalized_attn[i],
+                    )
+                ]
+
+                if self.retain_history:
+                    finalized[finalized_idxs[i]][0]["history"] = []
+                    for j in range(len(finalized_history_tokens)):
+                        finalized[finalized_idxs[i]][0]["history"].append(
+                            finalized_hypos(
+                                step, finalized_history_tokens[j][i], None, None
+                            )
+                        )
+
+            # check if all terminated
+            if terminated.sum() == terminated.size(0):
+                break
+
+            # for next step
+            not_terminated = ~terminated
+            prev_decoder_out = decoder_out._replace(
+                output_tokens=decoder_out.output_tokens[not_terminated],
+                output_scores=decoder_out.output_scores[not_terminated],
+                attn=decoder_out.attn[not_terminated]
+                if (decoder_out.attn is not None and decoder_out.attn.size(0) > 0)
+                else None,
+                history=[h[not_terminated] for h in decoder_out.history]
+                if decoder_out.history is not None
+                else None,
+            )
+            encoder_out = model.encoder.reorder_encoder_out(
+                encoder_out, not_terminated.nonzero(as_tuple=False).squeeze(-1)  # stays 1-D for one row
+            )
+            sent_idxs = sent_idxs[not_terminated.to(sent_idxs.device)]
+            prev_output_tokens = prev_decoder_out.output_tokens.clone()
+
+        if self.beam_size > 1:
+            if reranker is not None:
+                finalized = self.rerank(
+                    reranker, finalized, [src_tokens, src_lengths], self.beam_size
+                )
+
+            # aggregate information from length beam
+            finalized = [
+                finalized[
+                    np.argmax(
+                        [
+                            finalized[self.beam_size * i + j][0]["score"]
+                            for j in range(self.beam_size)
+                        ]
+                    )
+                    + self.beam_size * i
+                ]
+                for i in range(len(finalized) // self.beam_size)
+            ]
+
+        return finalized
+
+    def rerank(self, reranker, finalized, encoder_input, beam_size):  # rescores hypotheses in place
+        def rebuild_batch(finalized):  # right-pad every hypothesis' tokens into one 2-D tensor
+            finalized_tokens = [f[0]["tokens"] for f in finalized]
+            finalized_maxlen = max(f.size(0) for f in finalized_tokens)
+            final_output_tokens = (
+                finalized_tokens[0]
+                .new_zeros(len(finalized_tokens), finalized_maxlen)
+                .fill_(self.pad)
+            )
+            for i, f in enumerate(finalized_tokens):
+                final_output_tokens[i, : f.size(0)] = f
+            return final_output_tokens
+
+        final_output_tokens = rebuild_batch(finalized)
+        final_output_tokens[
+            :, 0
+        ] = self.eos  # autoregressive model assumes starting with EOS
+
+        reranker_encoder_out = reranker.encoder(*encoder_input)
+        length_beam_order = (  # presumably each source index repeated beam_size times -- TODO confirm
+            utils.new_arange(
+                final_output_tokens, beam_size, reranker_encoder_out.encoder_out.size(1)
+            )
+            .t()
+            .reshape(-1)
+        )
+        reranker_encoder_out = reranker.encoder.reorder_encoder_out(
+            reranker_encoder_out, length_beam_order
+        )
+        reranking_scores = reranker.get_normalized_probs(
+            reranker.decoder(final_output_tokens[:, :-1], reranker_encoder_out),
+            True,
+            None,
+        )
+        reranking_scores = reranking_scores.gather(2, final_output_tokens[:, 1:, None])
+        reranking_masks = final_output_tokens[:, 1:].ne(self.pad)  # True at non-pad target positions
+        reranking_scores = (
+            reranking_scores[:, :, 0].masked_fill_(~reranking_masks, 0).sum(1)
+        )
+        reranking_scores = reranking_scores / reranking_masks.sum(1).type_as(  # mean over non-pad
+            reranking_scores
+        )
+
+        for i in range(len(finalized)):
+            finalized[i][0]["score"] = reranking_scores[i]  # overwrite the score set by generate()
+
+        return finalized
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
new file mode 100644
index 0000000000..495bd08300
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
@@ -0,0 +1,351 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import bisect
+import time
+from collections import OrderedDict
+from typing import Dict, Optional
+
+try:
+    import torch
+
+    def type_as(a, b):  # when both are tensors, convert a via a.to(b); otherwise return a as is
+        if torch.is_tensor(a) and torch.is_tensor(b):
+            return a.to(b)
+        else:
+            return a
+
+except ImportError:
+    torch = None  # torch is optional for this module
+
+    def type_as(a, b):  # fallback without torch: nothing to convert
+        return a
+
+
+try:
+    import numpy as np
+except ImportError:
+    np = None  # numpy is optional as well; safe_round checks for None
+
+
+class Meter(object):
+    """Base class for Meters."""
+
+    def __init__(self):
+        pass
+
+    def state_dict(self):  # the base meter has nothing to serialize
+        return {}
+
+    def load_state_dict(self, state_dict):  # ... and nothing to restore
+        pass
+
+    def reset(self):  # not implemented in the base class
+        raise NotImplementedError
+
+    @property
+    def smoothed_value(self) -> float:
+        """Smoothed value used for logging."""
+        raise NotImplementedError
+
+
+def safe_round(number, ndigits):
+    """Round *number* if possible (unwrapping 1-element tensors / 0-d arrays), else return it."""
+    if hasattr(number, "__round__"):
+        return round(number, ndigits)
+    if (torch is not None and torch.is_tensor(number) and number.numel() == 1) or (
+        np is not None and np.ndim(number) == 0 and hasattr(number, "item")
+    ):
+        return safe_round(number.item(), ndigits)
+    return number
+
+
+class AverageMeter(Meter):
+    """Tracks the most recent value and the weighted running average."""
+
+    def __init__(self, round: Optional[int] = None):
+        self.round = round
+        self.reset()
+
+    def reset(self):
+        self.val = None  # most recent update
+        self.sum = self.count = 0  # weighted sum / total n from all updates
+
+    def update(self, val, n=1):
+        if val is None:
+            return
+        self.val = val
+        if n > 0:
+            self.sum = type_as(self.sum, val) + (val * n)
+            self.count = type_as(self.count, n) + n
+
+    def state_dict(self):
+        return {
+            "val": self.val,
+            "sum": self.sum,
+            "count": self.count,
+            "round": self.round,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.val = state_dict["val"]
+        self.sum = state_dict["sum"]
+        self.count = state_dict["count"]
+        self.round = state_dict.get("round", None)
+
+    @property
+    def avg(self):
+        return self.sum / self.count if self.count > 0 else self.val
+
+    @property
+    def smoothed_value(self) -> float:
+        average = self.avg
+        if self.round is None or average is None:
+            return average
+        return safe_round(average, self.round)
+
+
+class SumMeter(Meter):
+    """Accumulates the sum of all logged values."""
+
+    def __init__(self, round: Optional[int] = None):
+        self.round = round
+        self.reset()
+
+    def reset(self):
+        self.sum = 0  # sum from all updates
+
+    def update(self, val):
+        if val is None:
+            return
+        self.sum = type_as(self.sum, val) + val
+
+    def state_dict(self):
+        snapshot = {"sum": self.sum}
+        snapshot["round"] = self.round
+        return snapshot
+
+    def load_state_dict(self, state_dict):
+        self.sum = state_dict["sum"]
+        self.round = state_dict.get("round", None)
+
+    @property
+    def smoothed_value(self) -> float:
+        total = self.sum
+        if self.round is None or total is None:
+            return total
+        return safe_round(total, self.round)
+
+
+class ConcatTensorMeter(Meter):
+    """Concatenates tensors"""
+
+    def __init__(self, dim=0):
+        super().__init__()
+        self.reset()
+        self.dim = dim  # dimension passed to torch.cat in update()
+
+    def reset(self):
+        self.tensor = None
+
+    def update(self, val):
+        if self.tensor is None:
+            self.tensor = val  # the first value is stored as is (no copy)
+        else:
+            self.tensor = torch.cat([self.tensor, val], dim=self.dim)
+
+    def state_dict(self):  # note: dim is not part of the saved state
+        return {
+            "tensor": self.tensor,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.tensor = state_dict["tensor"]
+
+    @property
+    def smoothed_value(self) -> list:
+        return []  # return a dummy value
+
+
+class TimeMeter(Meter):
+    """Computes the average occurrence of some event per second"""
+
+    def __init__(
+        self,
+        init: int = 0,
+        n: int = 0,
+        round: Optional[int] = None,
+    ):
+        self.round = round
+        self.reset(init, n)
+
+    def reset(self, init=0, n=0):
+        self.init = init  # time already elapsed before this (re)start
+        self.start = time.perf_counter()
+        self.n = n  # running total of logged values
+        self.i = 0  # number of update() calls since the last reset
+
+    def update(self, val=1):
+        self.n = type_as(self.n, val) + val
+        self.i += 1
+
+    def state_dict(self):
+        return {
+            "init": self.elapsed_time,  # saved so a reloaded meter continues from this elapsed time
+            "n": self.n,
+            "round": self.round,
+        }
+
+    def load_state_dict(self, state_dict):
+        if "start" in state_dict:
+            # backwards compatibility for old state_dicts
+            self.reset(init=state_dict["init"])  # only "init" is restored; n restarts at 0
+        else:
+            self.reset(init=state_dict["init"], n=state_dict["n"])
+            self.round = state_dict.get("round", None)
+
+    @property
+    def avg(self):  # total logged amount divided by total elapsed time
+        return self.n / self.elapsed_time
+
+    @property
+    def elapsed_time(self):  # carried-over time plus time since the last reset
+        return self.init + (time.perf_counter() - self.start)
+
+    @property
+    def smoothed_value(self) -> float:
+        val = self.avg
+        if self.round is not None and val is not None:
+            val = safe_round(val, self.round)
+        return val
+
+
+class StopwatchMeter(Meter):
+    """Computes the sum/avg duration of some event in seconds"""
+
+    def __init__(self, round: Optional[int] = None):
+        self.round = round
+        self.sum = 0
+        self.n = 0
+        self.start_time = None  # None until start() is called
+
+    def start(self):
+        self.start_time = time.perf_counter()
+
+    def stop(self, n=1, prehook=None):
+        if self.start_time is not None:  # stop() without a prior start() is a no-op
+            if prehook is not None:
+                prehook()  # runs before the clock is read
+            delta = time.perf_counter() - self.start_time
+            self.sum = self.sum + delta
+            self.n = type_as(self.n, n) + n
+
+    def reset(self):
+        self.sum = 0  # cumulative time during which stopwatch was active
+        self.n = 0  # total n across all start/stop
+        self.start()  # note: reset also restarts the clock
+
+    def state_dict(self):
+        return {
+            "sum": self.sum,
+            "n": self.n,
+            "round": self.round,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.sum = state_dict["sum"]
+        self.n = state_dict["n"]
+        self.start_time = None
+        self.round = state_dict.get("round", None)
+
+    @property
+    def avg(self):  # sum divided by n; falls back to the plain sum while n is not positive
+        return self.sum / self.n if self.n > 0 else self.sum
+
+    @property
+    def elapsed_time(self):  # time since the last start(); 0.0 while start_time is None
+        if self.start_time is None:
+            return 0.0
+        return time.perf_counter() - self.start_time
+
+    @property
+    def smoothed_value(self) -> float:
+        val = self.avg if self.sum > 0 else self.elapsed_time  # nothing recorded yet: live time
+        if self.round is not None and val is not None:
+            val = safe_round(val, self.round)
+        return val
+
+
+class MetersDict(OrderedDict):
+    """A sorted dictionary of :class:`Meter` instances.
+
+    Meters are sorted according to a priority that is given when the
+    meter is first added to the dictionary.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.priorities = []  # sorted list of (priority, insertion index, key)
+
+    def __setitem__(self, key, value):
+        assert key not in self, "MetersDict doesn't support reassignment"
+        priority, meter = value  # items are assigned as (priority, meter) pairs
+        bisect.insort(self.priorities, (priority, len(self.priorities), key))
+        super().__setitem__(key, meter)
+        for _, _, ordered_key in self.priorities:  # reorder dict to match priorities
+            self.move_to_end(ordered_key)
+
+    def add_meter(self, key, meter, priority):
+        self.__setitem__(key, (priority, meter))
+
+    def state_dict(self):
+        return [
+            (pri, key, self[key].__class__.__name__, self[key].state_dict())
+            for pri, _, key in self.priorities
+            # can't serialize DerivedMeter instances
+            if not isinstance(self[key], MetersDict._DerivedMeter)
+        ]
+
+    def load_state_dict(self, state_dict):
+        self.clear()
+        self.priorities.clear()
+        for pri, key, meter_cls, meter_state in state_dict:
+            # NOTE: the class name comes from the state and is resolved in this module's globals
+            meter = globals()[meter_cls]()
+            meter.load_state_dict(meter_state)
+            self.add_meter(key, meter, pri)
+
+    def get_smoothed_value(self, key: str) -> float:
+        """Get a single smoothed value."""
+        meter = self[key]
+        if isinstance(meter, MetersDict._DerivedMeter):
+            return meter.fn(self)
+        else:
+            return meter.smoothed_value
+
+    def get_smoothed_values(self) -> Dict[str, float]:
+        """Get all smoothed values, skipping keys that start with an underscore."""
+        return OrderedDict(
+            [
+                (key, self.get_smoothed_value(key))
+                for key in self.keys()
+                if not key.startswith("_")
+            ]
+        )
+
+    def reset(self):
+        """Reset Meter instances."""
+        for meter in self.values():
+            if isinstance(meter, MetersDict._DerivedMeter):
+                continue
+            meter.reset()
+
+    class _DerivedMeter(Meter):
+        """A Meter whose values are derived from other Meters."""
+
+        def __init__(self, fn):
+            self.fn = fn  # called with the owning MetersDict to compute the value
+
+        def reset(self):
+            pass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
new file mode 100644
index 0000000000..49301f27f8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
@@ -0,0 +1,336 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+A standalone module for aggregating metrics.
+
+Metrics can be logged from anywhere using the `log_*` functions defined
+in this module. The logged values will be aggregated dynamically based
+on the aggregation context in which the logging occurs. See the
+:func:`aggregate` context manager for more details.
+"""
+
+import contextlib
+import uuid
+from collections import defaultdict
+from typing import Callable, List, Optional
+
+from .meters import *
+
+
+# Aggregation contexts are considered "active" when inside the scope
+# created by the :func:`aggregate` context manager.
+_aggregators = OrderedDict()
+_active_aggregators = OrderedDict()
+_active_aggregators_cnt = defaultdict(lambda: 0)
+
+
+def reset() -> None:
+    """Clear every aggregator registry and re-create the "default" aggregator."""
+    for registry in (_aggregators, _active_aggregators, _active_aggregators_cnt):
+        registry.clear()
+
+    # "default" observes all logged values, so it is registered and active from the start.
+    default_agg = MetersDict()
+    _aggregators["default"] = default_agg
+    _active_aggregators["default"] = default_agg
+    _active_aggregators_cnt["default"] = 1
+
+
+reset()
+
+
+@contextlib.contextmanager
+def aggregate(name: Optional[str] = None, new_root: bool = False):
+    """Context manager to aggregate metrics under a given name.
+
+    Aggregations can be nested. If *new_root* is ``False``, then logged
+    metrics will be recorded along the entire stack of nested
+    aggregators, including a global "default" aggregator. If *new_root*
+    is ``True``, then this aggregator will be the root of a new
+    aggregation stack, thus bypassing any parent aggregators.
+
+    Note that aggregation contexts are uniquely identified by their
+    *name* (e.g., train, valid). Creating a context with an existing
+    name will reuse the corresponding :class:`MetersDict` instance.
+    If no name is given, then a temporary aggregator will be created.
+
+    Usage::
+
+        with metrics.aggregate("train"):
+            for step, batch in enumerate(epoch):
+                with metrics.aggregate("train_inner") as agg:
+                    metrics.log_scalar("loss", get_loss(batch))
+                    if step % log_interval == 0:
+                        print(agg.get_smoothed_value("loss"))
+                        agg.reset()
+        print(metrics.get_smoothed_values("train")["loss"])
+
+    Args:
+        name (str): name of the aggregation. Defaults to a
+            random/temporary name if not given explicitly.
+        new_root (bool): make this aggregation the root of a new
+            aggregation stack.
+    """
+    if name is None:
+        # generate a temporary name
+        name = str(uuid.uuid4())
+        assert name not in _aggregators
+        agg = MetersDict()
+    else:
+        assert name != "default"
+        agg = _aggregators.setdefault(name, MetersDict())
+
+    if new_root:
+        backup_aggregators = _active_aggregators.copy()
+        _active_aggregators.clear()
+        backup_aggregators_cnt = _active_aggregators_cnt.copy()
+        _active_aggregators_cnt.clear()
+
+    _active_aggregators[name] = agg
+    _active_aggregators_cnt[name] += 1
+    try:
+        yield agg
+    finally:  # always unwind, even when the body of the ``with`` block raises
+        _active_aggregators_cnt[name] -= 1
+        if _active_aggregators_cnt[name] == 0 and name in _active_aggregators:
+            del _active_aggregators[name]
+
+        if new_root:
+            _active_aggregators.clear()
+            _active_aggregators.update(backup_aggregators)
+            _active_aggregators_cnt.clear()
+            _active_aggregators_cnt.update(backup_aggregators_cnt)
+
+
+def get_active_aggregators() -> List[MetersDict]:
+    return list(_active_aggregators.values())
+
+
+def log_scalar(
+    key: str,
+    value: float,
+    weight: float = 1,
+    priority: int = 10,
+    round: Optional[int] = None,
+):
+    """Log a scalar value.
+
+    Args:
+        key (str): name of the field to log
+        value (float): value to log
+        weight (float): weight that this value contributes to the average.
+            A weight of 0 will always log the latest value.
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            # priority and round only take effect here, when the meter is first created
+            agg.add_meter(key, AverageMeter(round=round), priority)
+        agg[key].update(value, weight)
+
+
+def log_scalar_sum(
+    key: str,
+    value: float,
+    priority: int = 10,
+    round: Optional[int] = None,
+):
+    """Log a scalar value that is summed for reporting.
+
+    Args:
+        key (str): name of the field to log
+        value (float): value to add to the running sum
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:  # priority and round only take effect when the meter is created
+            agg.add_meter(key, SumMeter(round=round), priority)
+        agg[key].update(value)
+
+
+def log_concat_tensor(
+    key: str,
+    value: torch.Tensor,
+    priority: int = 10,
+    dim: int = 0,
+):
+    """Log a tensor that is concatenated with previously logged tensors.
+
+    Args:
+        key (str): name of the field to log
+        value (torch.Tensor): tensor to append
+        priority (int): smaller values are logged earlier in the output
+        dim (int): dimension to concatenate along (used when the meter is first created)
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, ConcatTensorMeter(dim=dim), priority)
+        agg[key].update(value)
+
+
+def log_derived(key: str, fn: Callable[[MetersDict], float], priority: int = 20):
+    """Log a scalar value derived from other meters.
+
+    Args:
+        key (str): name of the field to log
+        fn (Callable[[MetersDict], float]): function that takes a single
+            argument *meters* and returns the derived value
+        priority (int): smaller values are logged earlier in the output
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:  # an aggregator that already has *key* keeps its existing meter/fn
+            agg.add_meter(key, MetersDict._DerivedMeter(fn), priority)
+
+
+def log_speed(
+    key: str,
+    value: float,
+    priority: int = 30,
+    round: Optional[int] = None,
+):
+    """Log the rate of some quantity per second.
+
+    Args:
+        key (str): name of the field to log
+        value (float): amount to add; not counted on the call that creates the meter
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, TimeMeter(round=round), priority)
+            agg[key].reset()  # reset meter on the first call
+        else:
+            agg[key].update(value)
+
+
+def log_start_time(key: str, priority: int = 40, round: Optional[int] = None):
+    """Log the duration of some event in seconds.
+
+    The duration will be computed once :func:`log_stop_time` is called.
+
+    Args:
+        key (str): name of the field to log
+        priority (int): smaller values are logged earlier in the output
+        round (Optional[int]): number of digits to round to when displaying
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, StopwatchMeter(round=round), priority)
+        agg[key].start()
+
+
+def log_stop_time(key: str, weight: float = 0.0, prehook=None):
+    """Log the duration of some event in seconds.
+
+    The duration will be computed since :func:`log_start_time` was called.
+    Set weight > 0 to report the average time instead of the sum.
+
+    Args:
+        key (str): name of the field to log
+        weight (float): weight that this time contributes to the average
+        prehook (function, no arguments): will be called before the timer
+            is stopped. For example, use prehook=torch.cuda.synchronize to
+            make sure all gpu operations are done before timer is stopped.
+    """
+    for agg in get_active_aggregators():
+        if key in agg:  # aggregators without a meter for *key* are silently skipped
+            agg[key].stop(weight, prehook)
+
+
+def log_custom(
+    new_meter_fn: Callable[[], Meter],
+    key: str,
+    *args,
+    priority: int = 50,
+    **kwargs,
+):
+    """Log using a custom Meter.
+
+    Any extra *args* or *kwargs* will be passed through to the Meter's
+    *update* method.
+
+    Args:
+        new_meter_fn (Callable[[], Meter]): function that returns a new
+            Meter instance
+        key (str): name of the field to log
+        priority (int): smaller values are logged earlier in the output
+    """
+    for agg in get_active_aggregators():
+        if key not in agg:
+            agg.add_meter(key, new_meter_fn(), priority)
+        agg[key].update(*args, **kwargs)
+
+
+def reset_meter(name: str, key: str) -> None:
+    """Reset Meter instance aggregated under a given *name* and *key*."""
+    meter = get_meter(name, key)
+    if meter is not None:
+        meter.reset()
+
+
+def reset_meters(name: str) -> None:
+    """Reset Meter instances aggregated under a given *name*."""
+    meters = get_meters(name)
+    if meters is not None:
+        meters.reset()
+
+
+def get_meter(name: str, key: str) -> Optional[Meter]:
+    """Get a single Meter instance aggregated under *name* and *key*.
+
+    Returns:
+        Meter or None if no metrics have been logged under *name* and *key*.
+    """
+    if name not in _aggregators:
+        return None
+    return _aggregators[name].get(key, None)
+
+
+def get_meters(name: str) -> Optional[MetersDict]:
+    """Get Meter instances aggregated under a given *name*.
+
+    Returns:
+        MetersDict or None if no metrics have been logged under *name*.
+    """
+    return _aggregators.get(name, None)
+
+
+def get_smoothed_value(name: str, key: str) -> float:
+    """Get a single smoothed value.
+
+    Raises:
+        KeyError: if no metrics have been logged under *name* and *key*.
+    """
+    return _aggregators[name].get_smoothed_value(key)
+
+
+def get_smoothed_values(name: str) -> Dict[str, float]:
+    """Get smoothed values aggregated under a given *name*.
+
+    Raises:
+        KeyError: if no metrics have been logged under *name*.
+    """
+    return _aggregators[name].get_smoothed_values()
+
+
+def state_dict():  # maps each aggregator name to that aggregator's serialized meters
+    return OrderedDict((name, agg.state_dict()) for name, agg in _aggregators.items())
+
+
+def load_state_dict(state_dict):  # rebuilds one MetersDict per saved aggregator name
+    for name, agg_state in state_dict.items():
+        # NOTE(review): _active_aggregators is not updated, so an already-active entry (e.g.
+        # "default") keeps the old object -- confirm this is intended
+        _aggregators[name] = MetersDict()
+        _aggregators[name].load_state_dict(agg_state)
+
+
+def xla_metrics_report():
+    try:
+        import torch_xla.debug.metrics as met
+
+        print(met.metrics_report())
+    except ImportError:
+        return
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
new file mode 100644
index 0000000000..4c64b61bad
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
@@ -0,0 +1,582 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Wrapper around various loggers and progress bars (e.g., tqdm).
+"""
+
+import atexit
+import json
+import logging
+import os
+import sys
+from collections import OrderedDict
+from contextlib import contextmanager
+from numbers import Number
+from typing import Optional
+
+import torch
+
+from .meters import AverageMeter, StopwatchMeter, TimeMeter
+
+logger = logging.getLogger(__name__)
+
+
+def progress_bar(
+    iterator,
+    log_format: Optional[str] = None,
+    log_interval: int = 100,
+    log_file: Optional[str] = None,
+    epoch: Optional[int] = None,
+    prefix: Optional[str] = None,
+    aim_repo: Optional[str] = None,
+    aim_run_hash: Optional[str] = None,
+    aim_param_checkpoint_dir: Optional[str] = None,
+    tensorboard_logdir: Optional[str] = None,
+    default_log_format: str = "tqdm",
+    wandb_project: Optional[str] = None,
+    wandb_run_name: Optional[str] = None,
+    azureml_logging: Optional[bool] = False,
+):
+    """Build a progress bar around *iterator* and layer optional loggers on it.
+
+    The base bar is chosen by *log_format* ("json", "none", "simple" or
+    "tqdm").  When *log_format* is None, *default_log_format* is used, and
+    "tqdm" falls back to "simple" if stderr is not a TTY.  The bar is then
+    wrapped, in this order, by the Aim, TensorBoard, Weights & Biases and
+    Azure ML wrappers for whichever of *aim_repo*, *tensorboard_logdir*,
+    *wandb_project* and *azureml_logging* are set.
+
+    If *log_file* is given, a ``logging.FileHandler`` for it is added to this
+    module's logger on every call.
+
+    Returns:
+        The (possibly wrapped) progress bar object.
+
+    Raises:
+        ValueError: if the resolved log format is not one of the four above.
+    """
+    if log_format is None:
+        log_format = default_log_format
+    if log_file is not None:
+        # NOTE: a new handler is attached each time this function is called.
+        handler = logging.FileHandler(filename=log_file)
+        logger.addHandler(handler)
+
+    # tqdm needs an interactive terminal; otherwise use the plain-text logger.
+    if log_format == "tqdm" and not sys.stderr.isatty():
+        log_format = "simple"
+
+    if log_format == "json":
+        bar = JsonProgressBar(iterator, epoch, prefix, log_interval)
+    elif log_format == "none":
+        bar = NoopProgressBar(iterator, epoch, prefix)
+    elif log_format == "simple":
+        bar = SimpleProgressBar(iterator, epoch, prefix, log_interval)
+    elif log_format == "tqdm":
+        bar = TqdmProgressBar(iterator, epoch, prefix)
+    else:
+        raise ValueError("Unknown log format: {}".format(log_format))
+
+    # Each enabled backend wraps the bar built so far, so the last wrapper
+    # applied is the outermost one and sees log/print calls first.
+    if aim_repo:
+        bar = AimProgressBarWrapper(
+            bar,
+            aim_repo=aim_repo,
+            aim_run_hash=aim_run_hash,
+            aim_param_checkpoint_dir=aim_param_checkpoint_dir,
+        )
+
+    if tensorboard_logdir:
+        try:
+            # [FB only] custom wrapper for TensorBoard
+            import palaas  # noqa
+
+            from .fb_tbmf_wrapper import FbTbmfWrapper
+
+            bar = FbTbmfWrapper(bar, log_interval)
+        except ImportError:
+            # Either import failed: use the regular TensorBoard wrapper.
+            bar = TensorboardProgressBarWrapper(bar, tensorboard_logdir)
+
+    if wandb_project:
+        bar = WandBProgressBarWrapper(bar, wandb_project, run_name=wandb_run_name)
+
+    if azureml_logging:
+        bar = AzureMLProgressBarWrapper(bar)
+
+    return bar
+
+
+def build_progress_bar(
+    args,
+    iterator,
+    epoch: Optional[int] = None,
+    prefix: Optional[str] = None,
+    default: str = "tqdm",
+    no_progress_bar: str = "none",
+):
+    """Legacy wrapper that takes an argparse.Namespace."""
+    if getattr(args, "no_progress_bar", False):
+        default = no_progress_bar
+    if getattr(args, "distributed_rank", 0) == 0:
+        tensorboard_logdir = getattr(args, "tensorboard_logdir", None)
+    else:
+        tensorboard_logdir = None
+    return progress_bar(
+        iterator,
+        log_format=args.log_format,
+        log_interval=args.log_interval,
+        epoch=epoch,
+        prefix=prefix,
+        tensorboard_logdir=tensorboard_logdir,
+        default_log_format=default,
+    )
+
+
+def format_stat(stat):
+    if isinstance(stat, Number):
+        stat = "{:g}".format(stat)
+    elif isinstance(stat, AverageMeter):
+        stat = "{:.3f}".format(stat.avg)
+    elif isinstance(stat, TimeMeter):
+        stat = "{:g}".format(round(stat.avg))
+    elif isinstance(stat, StopwatchMeter):
+        stat = "{:g}".format(round(stat.sum))
+    elif torch.is_tensor(stat):
+        stat = stat.tolist()
+    return stat
+
+
+class BaseProgressBar(object):
+    """Abstract class for progress bars."""
+
+    def __init__(self, iterable, epoch=None, prefix=None):
+        self.iterable = iterable
+        self.n = getattr(iterable, "n", 0)
+        self.epoch = epoch
+        self.prefix = ""
+        if epoch is not None:
+            self.prefix += "epoch {:03d}".format(epoch)
+        if prefix is not None:
+            self.prefix += (" | " if self.prefix != "" else "") + prefix
+
+    def __len__(self):
+        return len(self.iterable)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        return False
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        raise NotImplementedError
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        raise NotImplementedError
+
+    def update_config(self, config):
+        """Log latest configuration."""
+        pass
+
+    def _str_commas(self, stats):
+        return ", ".join(key + "=" + stats[key].strip() for key in stats.keys())
+
+    def _str_pipes(self, stats):
+        return " | ".join(key + " " + stats[key].strip() for key in stats.keys())
+
+    def _format_stats(self, stats):
+        postfix = OrderedDict(stats)
+        # Preprocess stats according to datatype
+        for key in postfix.keys():
+            postfix[key] = str(format_stat(postfix[key]))
+        return postfix
+
+
+@contextmanager
+def rename_logger(logger, new_name):
+    old_name = logger.name
+    if new_name is not None:
+        logger.name = new_name
+    yield logger
+    logger.name = old_name
+
+
+class JsonProgressBar(BaseProgressBar):
+    """Log output in JSON format."""
+
+    def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000):
+        super().__init__(iterable, epoch, prefix)
+        self.log_interval = log_interval
+        self.i = None
+        self.size = None
+
+    def __iter__(self):
+        self.size = len(self.iterable)
+        for i, obj in enumerate(self.iterable, start=self.n):
+            self.i = i
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        step = step or self.i or 0
+        if step > 0 and self.log_interval is not None and step % self.log_interval == 0:
+            update = (
+                self.epoch - 1 + (self.i + 1) / float(self.size)
+                if self.epoch is not None
+                else None
+            )
+            stats = self._format_stats(stats, epoch=self.epoch, update=update)
+            with rename_logger(logger, tag):
+                logger.info(json.dumps(stats))
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self.stats = stats
+        if tag is not None:
+            self.stats = OrderedDict(
+                [(tag + "_" + k, v) for k, v in self.stats.items()]
+            )
+        stats = self._format_stats(self.stats, epoch=self.epoch)
+        with rename_logger(logger, tag):
+            logger.info(json.dumps(stats))
+
+    def _format_stats(self, stats, epoch=None, update=None):
+        postfix = OrderedDict()
+        if epoch is not None:
+            postfix["epoch"] = epoch
+        if update is not None:
+            postfix["update"] = round(update, 3)
+        # Preprocess stats according to datatype
+        for key in stats.keys():
+            postfix[key] = format_stat(stats[key])
+        return postfix
+
+
+class NoopProgressBar(BaseProgressBar):
+    """No logging."""
+
+    def __init__(self, iterable, epoch=None, prefix=None):
+        super().__init__(iterable, epoch, prefix)
+
+    def __iter__(self):
+        for obj in self.iterable:
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        pass
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        pass
+
+
+class SimpleProgressBar(BaseProgressBar):
+    """A minimal logger for non-TTY environments."""
+
+    def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000):
+        super().__init__(iterable, epoch, prefix)
+        self.log_interval = log_interval
+        self.i = None
+        self.size = None
+
+    def __iter__(self):
+        self.size = len(self.iterable)
+        for i, obj in enumerate(self.iterable, start=self.n):
+            self.i = i
+            yield obj
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        step = step or self.i or 0
+        if step > 0 and self.log_interval is not None and step % self.log_interval == 0:
+            stats = self._format_stats(stats)
+            postfix = self._str_commas(stats)
+            with rename_logger(logger, tag):
+                logger.info(
+                    "{}:  {:5d} / {:d} {}".format(
+                        self.prefix, self.i + 1, self.size, postfix
+                    )
+                )
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        postfix = self._str_pipes(self._format_stats(stats))
+        with rename_logger(logger, tag):
+            logger.info("{} | {}".format(self.prefix, postfix))
+
+
+class TqdmProgressBar(BaseProgressBar):
+    """Log to tqdm."""
+
+    def __init__(self, iterable, epoch=None, prefix=None):
+        super().__init__(iterable, epoch, prefix)
+        from tqdm import tqdm
+
+        self.tqdm = tqdm(
+            iterable,
+            self.prefix,
+            leave=False,
+            disable=(logger.getEffectiveLevel() > logging.INFO),
+        )
+
+    def __iter__(self):
+        return iter(self.tqdm)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats according to log_interval."""
+        self.tqdm.set_postfix(self._format_stats(stats), refresh=False)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        postfix = self._str_pipes(self._format_stats(stats))
+        with rename_logger(logger, tag):
+            logger.info("{} | {}".format(self.prefix, postfix))
+
+
+# Aim is an optional dependency.  When it imports, ``get_aim_run`` is a cached
+# factory for Aim runs; when it does not, both names below are None.
+try:
+    import functools
+
+    from aim import Repo as AimRepo
+
+    @functools.lru_cache()
+    def get_aim_run(repo, run_hash):
+        """Return the Aim ``Run`` for (*repo*, *run_hash*).
+
+        Results are memoized by ``functools.lru_cache``, so repeated calls
+        with the same arguments return the same Run object.
+        """
+        from aim import Run
+
+        return Run(run_hash=run_hash, repo=repo)
+
+except ImportError:
+    # Callers check ``get_aim_run is None`` to detect that Aim is missing.
+    get_aim_run = None
+    AimRepo = None
+
+
+class AimProgressBarWrapper(BaseProgressBar):
+    """Log to Aim."""
+
+    def __init__(self, wrapped_bar, aim_repo, aim_run_hash, aim_param_checkpoint_dir):
+        self.wrapped_bar = wrapped_bar
+
+        if get_aim_run is None:
+            self.run = None
+            logger.warning("Aim not found, please install with: pip install aim")
+        else:
+            logger.info(f"Storing logs at Aim repo: {aim_repo}")
+
+            if not aim_run_hash:
+                # Find run based on save_dir parameter
+                query = f"run.checkpoint.save_dir == '{aim_param_checkpoint_dir}'"
+                try:
+                    runs_generator = AimRepo(aim_repo).query_runs(query)
+                    run = next(runs_generator.iter_runs())
+                    aim_run_hash = run.run.hash
+                except Exception:
+                    pass
+
+            if aim_run_hash:
+                logger.info(f"Appending to run: {aim_run_hash}")
+
+            self.run = get_aim_run(aim_repo, aim_run_hash)
+
+    def __iter__(self):
+        return iter(self.wrapped_bar)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats to Aim."""
+        self._log_to_aim(stats, tag, step)
+        self.wrapped_bar.log(stats, tag=tag, step=step)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self._log_to_aim(stats, tag, step)
+        self.wrapped_bar.print(stats, tag=tag, step=step)
+
+    def update_config(self, config):
+        """Log latest configuration."""
+        if self.run is not None:
+            for key in config:
+                self.run.set(key, config[key], strict=False)
+        self.wrapped_bar.update_config(config)
+
+    def _log_to_aim(self, stats, tag=None, step=None):
+        if self.run is None:
+            return
+
+        if step is None:
+            step = stats["num_updates"]
+
+        if "train" in tag:
+            context = {"tag": tag, "subset": "train"}
+        elif "val" in tag:
+            context = {"tag": tag, "subset": "val"}
+        else:
+            context = {"tag": tag}
+
+        for key in stats.keys() - {"num_updates"}:
+            self.run.track(stats[key], name=key, step=step, context=context)
+
+
+try:
+    _tensorboard_writers = {}
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    try:
+        from tensorboardX import SummaryWriter
+    except ImportError:
+        SummaryWriter = None
+
+
+def _close_writers():
+    for w in _tensorboard_writers.values():
+        w.close()
+
+
+atexit.register(_close_writers)
+
+
+class TensorboardProgressBarWrapper(BaseProgressBar):
+    """Log to tensorboard."""
+
+    def __init__(self, wrapped_bar, tensorboard_logdir):
+        self.wrapped_bar = wrapped_bar
+        self.tensorboard_logdir = tensorboard_logdir
+
+        if SummaryWriter is None:
+            logger.warning(
+                "tensorboard not found, please install with: pip install tensorboard"
+            )
+
+    def _writer(self, key):
+        if SummaryWriter is None:
+            return None
+        _writers = _tensorboard_writers
+        if key not in _writers:
+            _writers[key] = SummaryWriter(os.path.join(self.tensorboard_logdir, key))
+            _writers[key].add_text("sys.argv", " ".join(sys.argv))
+        return _writers[key]
+
+    def __iter__(self):
+        return iter(self.wrapped_bar)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats to tensorboard."""
+        self._log_to_tensorboard(stats, tag, step)
+        self.wrapped_bar.log(stats, tag=tag, step=step)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self._log_to_tensorboard(stats, tag, step)
+        self.wrapped_bar.print(stats, tag=tag, step=step)
+
+    def update_config(self, config):
+        """Log latest configuration."""
+        # TODO add hparams to Tensorboard
+        self.wrapped_bar.update_config(config)
+
+    def _log_to_tensorboard(self, stats, tag=None, step=None):
+        writer = self._writer(tag or "")
+        if writer is None:
+            return
+        if step is None:
+            step = stats["num_updates"]
+        for key in stats.keys() - {"num_updates"}:
+            if isinstance(stats[key], AverageMeter):
+                writer.add_scalar(key, stats[key].val, step)
+            elif isinstance(stats[key], Number):
+                writer.add_scalar(key, stats[key], step)
+            elif torch.is_tensor(stats[key]) and stats[key].numel() == 1:
+                writer.add_scalar(key, stats[key].item(), step)
+        writer.flush()
+
+
+try:
+    import wandb
+except ImportError:
+    wandb = None
+
+
+class WandBProgressBarWrapper(BaseProgressBar):
+    """Log to Weights & Biases."""
+
+    def __init__(self, wrapped_bar, wandb_project, run_name=None):
+        self.wrapped_bar = wrapped_bar
+        if wandb is None:
+            logger.warning("wandb not found, pip install wandb")
+            return
+
+        # reinit=False to ensure if wandb.init() is called multiple times
+        # within one process it still references the same run
+        wandb.init(project=wandb_project, reinit=False, name=run_name)
+
+    def __iter__(self):
+        return iter(self.wrapped_bar)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats to tensorboard."""
+        self._log_to_wandb(stats, tag, step)
+        self.wrapped_bar.log(stats, tag=tag, step=step)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats."""
+        self._log_to_wandb(stats, tag, step)
+        self.wrapped_bar.print(stats, tag=tag, step=step)
+
+    def update_config(self, config):
+        """Log latest configuration."""
+        if wandb is not None:
+            wandb.config.update(config)
+        self.wrapped_bar.update_config(config)
+
+    def _log_to_wandb(self, stats, tag=None, step=None):
+        if wandb is None:
+            return
+        if step is None:
+            step = stats["num_updates"]
+
+        prefix = "" if tag is None else tag + "/"
+
+        for key in stats.keys() - {"num_updates"}:
+            if isinstance(stats[key], AverageMeter):
+                wandb.log({prefix + key: stats[key].val}, step=step)
+            elif isinstance(stats[key], Number):
+                wandb.log({prefix + key: stats[key]}, step=step)
+
+
+try:
+    from azureml.core import Run
+except ImportError:
+    Run = None
+
+
+class AzureMLProgressBarWrapper(BaseProgressBar):
+    """Log to Azure ML"""
+
+    def __init__(self, wrapped_bar):
+        self.wrapped_bar = wrapped_bar
+        if Run is None:
+            logger.warning("azureml.core not found, pip install azureml-core")
+            return
+        self.run = Run.get_context()
+
+    def __exit__(self, *exc):
+        if Run is not None:
+            self.run.complete()
+        return False
+
+    def __iter__(self):
+        return iter(self.wrapped_bar)
+
+    def log(self, stats, tag=None, step=None):
+        """Log intermediate stats to AzureML"""
+        self._log_to_azureml(stats, tag, step)
+        self.wrapped_bar.log(stats, tag=tag, step=step)
+
+    def print(self, stats, tag=None, step=None):
+        """Print end-of-epoch stats"""
+        self._log_to_azureml(stats, tag, step)
+        self.wrapped_bar.print(stats, tag=tag, step=step)
+
+    def update_config(self, config):
+        """Log latest configuration."""
+        self.wrapped_bar.update_config(config)
+
+    def _log_to_azureml(self, stats, tag=None, step=None):
+        if Run is None:
+            return
+        if step is None:
+            step = stats["num_updates"]
+
+        prefix = "" if tag is None else tag + "/"
+
+        for key in stats.keys() - {"num_updates"}:
+            name = prefix + key
+            if isinstance(stats[key], AverageMeter):
+                self.run.log_row(name=name, **{"step": step, key: stats[key].val})
+            elif isinstance(stats[key], Number):
+                self.run.log_row(name=name, **{"step": step, key: stats[key]})
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
new file mode 100644
index 0000000000..69f2168487
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import criterions, models, modules  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
new file mode 100644
index 0000000000..5fae7bd4c2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# automatically import any Python files in the criterions/ directory
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        module = file[: file.find(".py")]
+        importlib.import_module("fairseq.model_parallel.criterions." + module)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
new file mode 100644
index 0000000000..5ffbaa8764
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+try:
+    from fairseq.model_parallel.megatron.mpu.cross_entropy import (
+        vocab_parallel_cross_entropy,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+@register_criterion("vocab_parallel_cross_entropy")
+class VocabParallelCrossEntropyCriterion(FairseqCriterion):
+    def __init__(self, task, sentence_avg):
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+        if not has_megatron_submodule:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        target = sample["target"]
+
+        loss = vocab_parallel_cross_entropy(net_output[0].float(), target)
+        loss = (loss * (target != self.padding_idx)).sum()
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def reduce_metrics(logging_outputs) -> None:
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        metrics.log_scalar(
+            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+        )
+        if sample_size != ntokens:
+            metrics.log_scalar(
+                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+            )
+        else:
+            metrics.log_derived(
+                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+            )
+
+    @staticmethod
+    def logging_outputs_can_be_summed() -> bool:
+        """
+        Whether the logging outputs returned by `forward` can be summed
+        across workers prior to calling `reduce_metrics`. Setting this
+        to True will improves distributed training speed.
+        """
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
new file mode 100644
index 0000000000..aedf608bce
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Train a network across multiple GPUs.
+"""
+
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.distributed import utils as distributed_utils
+from fairseq.trainer import Trainer
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        get_data_parallel_rank,
+        get_data_parallel_world_size,
+        get_model_parallel_src_rank,
+        get_cuda_rng_tracker,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class MegatronTrainer(Trainer):
+    """Main class for model parallel with data parallel training."""
+
+    def __init__(self, cfg: FairseqConfig, task, model, criterion, **kwargs):
+        if not has_megatron_submodule:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+        super().__init__(cfg, task, model, criterion, **kwargs)
+
+    def clip_grad_norm(self, clip_norm):
+        def _aggregate_model_parallel_grad_norm(total_norm):
+            total_norm = total_norm**2
+            distributed_utils.all_reduce(
+                total_norm, group=distributed_utils.get_model_parallel_group()
+            )
+            total_norm = total_norm**0.5
+            return total_norm
+
+        return self.optimizer.clip_grad_norm(
+            clip_norm,
+            aggregate_norm_fn=_aggregate_model_parallel_grad_norm,
+        )
+
+    def save_checkpoint(self, filename, extra_state):
+        """Save all training state in a checkpoint file."""
+        extra_state["rng_tracker_states"] = get_cuda_rng_tracker().get_states()
+        super().save_checkpoint(filename, extra_state)
+
+    def load_checkpoint(
+        self,
+        filename,
+        reset_optimizer=False,
+        reset_lr_scheduler=False,
+        optimizer_overrides=None,
+        reset_meters=False,
+    ):
+        extra_state = super().load_checkpoint(
+            filename,
+            reset_optimizer=reset_optimizer,
+            reset_lr_scheduler=reset_lr_scheduler,
+            optimizer_overrides=optimizer_overrides,
+            reset_meters=reset_meters,
+        )
+        if extra_state is not None and "rng_tracker_states" in extra_state:
+            get_cuda_rng_tracker().set_states(extra_state["rng_tracker_states"])
+        return extra_state
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
new file mode 100644
index 0000000000..3532479e52
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# automatically import any Python files in the models/ directory
+models_dir = os.path.dirname(__file__)
+for file in os.listdir(models_dir):
+    path = os.path.join(models_dir, file)
+    if (
+        not file.startswith("_")
+        and not file.startswith(".")
+        and (file.endswith(".py") or os.path.isdir(path))
+    ):
+        model_name = file[: file.find(".py")] if file.endswith(".py") else file
+        module = importlib.import_module("fairseq.model_parallel.models." + model_name)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
new file mode 100644
index 0000000000..117827c3e9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
new file mode 100644
index 0000000000..85dbd44b3c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
@@ -0,0 +1,600 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from collections import namedtuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import options, utils
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+)
+
+EncoderOut = namedtuple(  # not referenced again anywhere in this module
+    "TransformerEncoderOut",
+    [
+        "encoder_out",  # T x B x C
+        "encoder_padding_mask",  # B x T
+        "encoder_embedding",  # B x T x C
+        "encoder_states",  # List[T x B x C]
+    ],
+)
+
+
+class TransformerEncoderEmbedding(nn.Module):
+    """Encoder token embedding (scaled) plus optional positional embedding/LayerNorm."""
+
+    def __init__(self, args, embed_tokens):
+        super().__init__()
+        self.dropout = args.dropout
+        self.max_source_positions = args.max_source_positions
+        self.embed_tokens = embed_tokens
+        if isinstance(embed_tokens, nn.ModuleList):  # embedding split into chunks
+            self.padding_idx = embed_tokens[0].padding_idx
+            embed_dim = sum(e.embedding_dim for e in embed_tokens)
+        else:
+            self.padding_idx = embed_tokens.padding_idx
+            embed_dim = embed_tokens.embedding_dim
+        self.embed_scale = math.sqrt(embed_dim)
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.encoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
+    def forward(self, input):
+        # embed tokens and positions; input[1] is not read by this layer
+        src_tokens = input[0]
+        prev_output_tokens = input[2]
+        if isinstance(self.embed_tokens, nn.ModuleList):
+            x_embed_list = []
+            for embed_tokens_part in self.embed_tokens:
+                x_embed_list.append(embed_tokens_part(src_tokens))
+
+            embedded = torch.cat(x_embed_list, dim=-1)  # join chunk outputs on last dim
+        else:
+            embedded = self.embed_tokens(src_tokens)
+        x = embed = self.embed_scale * embedded
+        if self.embed_positions is not None:
+            x = embed + self.embed_positions(src_tokens)
+        if self.layernorm_embedding:
+            x = self.layernorm_embedding(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # compute padding mask: True wherever src_tokens equals padding_idx
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+        return (x, encoder_padding_mask, prev_output_tokens)  # tokens forwarded untouched
+
+
+class TransformerEncoderLayerNorm(nn.Module):
+    """
+    Layer norm at the end of all encoder layers if
+    args.encoder_normalize_before = True
+    """
+
+    def __init__(self, args, embed_dim):
+        super().__init__()
+        if args.encoder_normalize_before:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(self, input):
+        x = input[0]
+        encoder_padding_mask = input[1]
+        prev_output_tokens = input[2]
+        if self.layer_norm:
+            x = self.layer_norm(x)
+        # keeping track of the incremental_state is not supported yet
+        return (x, encoder_padding_mask, prev_output_tokens)  # same 3-tuple layout as input
+
+
+class TransformerDecoderEmbedding(nn.Module):
+    """Decoder token embedding (scaled, optionally projected) + positional embedding."""
+
+    def __init__(self, args, embed_tokens):
+        super().__init__()
+        self.dropout = args.dropout
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+        input_embed_dim = (
+            sum(e.embedding_dim for e in embed_tokens)
+            if isinstance(embed_tokens, nn.ModuleList)
+            else embed_tokens.embedding_dim
+        )
+        embed_dim = args.decoder_embed_dim
+        self.output_embed_dim = args.decoder_output_dim
+
+        padding_idx = (
+            embed_tokens[0].padding_idx
+            if isinstance(embed_tokens, nn.ModuleList)
+            else embed_tokens.padding_idx
+        )
+        self.max_target_positions = args.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        self.project_in_dim = (  # only needed when token and model widths differ
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_target_positions,
+                embed_dim,
+                padding_idx,
+                learned=args.decoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+    def forward(self, input):
+        mt_task = False  # True only for the 3-tuple input that carries encoder state
+        if isinstance(input, tuple):
+            if len(input) == 3:  # (encoder_out, encoder_padding_mask, prev_output_tokens)
+                encoder_out = input[0]
+                encoder_padding_mask = input[1]
+                prev_output_tokens = input[2]
+                incremental_state = None  # Hardcoding to avoid passing of None objects
+                mt_task = True
+            else:
+                # HACK for now, need to fix (TODO sidgoyal)
+                prev_output_tokens = input[0]
+                # discard "src_lengths"
+                encoder_out = None
+                encoder_padding_mask = None
+                incremental_state = None
+
+        else:
+            prev_output_tokens = input
+            encoder_out = None
+            encoder_padding_mask = None
+            incremental_state = None
+
+        positions = (
+            self.embed_positions(
+                prev_output_tokens,
+                incremental_state=incremental_state,
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:  # never taken: always None in every branch above
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+
+        if isinstance(self.embed_tokens, nn.ModuleList):
+            x_embed_list = []
+            for embed_tokens_part in self.embed_tokens:
+                x_embed_list.append(embed_tokens_part(prev_output_tokens))
+
+            x = self.embed_scale * torch.cat(x_embed_list, dim=-1)
+        else:
+            x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        if mt_task:
+            return (x, encoder_out, encoder_padding_mask)  # encoder state forwarded as-is
+        return x
+
+
+class TransformerDecoderOutputLayer(nn.Module):  # final norm + projection to vocabulary
+    def __init__(self, args, embed_tokens, dictionary):
+        super().__init__()
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+        self.embed_tokens = embed_tokens
+        self.output_embed_dim = args.decoder_output_dim
+        embed_dim = args.decoder_embed_dim
+
+        self.project_out_dim = (
+            Linear(embed_dim, self.output_embed_dim, bias=False)
+            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
+            else None
+        )
+        self.adaptive_softmax = None
+        if args.adaptive_softmax_cutoff is not None:
+            assert not isinstance(embed_tokens, nn.ModuleList)  # no chunked embeddings here
+            self.adaptive_softmax = AdaptiveSoftmax(
+                len(dictionary),
+                self.output_embed_dim,
+                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
+                dropout=args.adaptive_softmax_dropout,
+                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
+                factor=args.adaptive_softmax_factor,
+                tie_proj=args.tie_adaptive_proj,
+            )
+        elif not self.share_input_output_embed:  # untied: own output projection matrix
+            self.embed_tokens = nn.Parameter(
+                torch.Tensor(len(dictionary), self.output_embed_dim)
+            )
+            nn.init.normal_(
+                self.embed_tokens, mean=0, std=self.output_embed_dim**-0.5
+            )
+
+        if args.decoder_normalize_before and not getattr(
+            args, "no_decoder_final_norm", False
+        ):
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(self, input, apply_final_proj=True):
+        if isinstance(input, tuple):
+            x = input[0]  # any remaining tuple items are ignored
+        else:
+            x = input
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+        if apply_final_proj:
+            x = self.output_layer(x)
+        return x
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size."""
+        if self.adaptive_softmax is None:
+            # project back to size of vocabulary
+            if self.share_input_output_embed:
+                if isinstance(self.embed_tokens, nn.ModuleList):
+                    output = None
+                    for i, emb in enumerate(self.embed_tokens):
+                        sidx = i * emb.embedding_dim  # assumes equal-width chunks - TODO confirm
+                        eidx = (i + 1) * emb.embedding_dim
+                        if output is None:
+                            output = F.linear(features[:, :, sidx:eidx], emb.weight)
+                        else:
+                            output += F.linear(features[:, :, sidx:eidx], emb.weight)
+
+                    return output  # sum of the per-chunk projections
+                else:
+                    return F.linear(features, self.embed_tokens.weight)
+            else:
+                return F.linear(features, self.embed_tokens)
+        else:
+            return features  # adaptive softmax configured: features returned unprojected
+
+
+class TransformerEncoderLayer(nn.Module):
+    """Encoder layer block.
+    In the original paper each operation (multi-head attention or FFN) is
+    postprocessed with: `dropout -> add residual -> layernorm`. In the
+    tensor2tensor code they suggest that learning is more robust when
+    preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.encoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+    """
+
+    def __init__(self, args):
+        super().__init__()
+        self.embed_dim = args.encoder_embed_dim
+        self.self_attn = MultiheadAttention(
+            self.embed_dim,
+            args.encoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=True,
+        )
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.dropout = args.dropout
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu")
+        )
+        self.activation_dropout = getattr(args, "activation_dropout", 0)
+        if self.activation_dropout == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            self.activation_dropout = getattr(args, "relu_dropout", 0)
+        self.normalize_before = args.encoder_normalize_before
+        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
+        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """
+        Rename layer norm states from `...layer_norms.0.weight` to
+        `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
+        `...final_layer_norm.weight`
+        """
+        layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"}
+        for old, new in layer_norm_map.items():
+            for m in ("weight", "bias"):
+                k = "{}.layer_norms.{}.{}".format(name, old, m)
+                if k in state_dict:  # state_dict is modified in place
+                    state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k]
+                    del state_dict[k]
+
+    def forward(self, input):
+        """
+        Args:
+            input (Tuple):
+                input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+                input[1] (ByteTensor/FloatTensor): encoder padding mask -
+                    binary ByteTensor of shape `(batch, src_len)` where padding elements
+                    are indicated by ``1``.
+                input[2] (LongTensor): previous decoder outputs of shape
+                    `(batch, tgt_len)`, for teacher forcing
+        Returns:
+            output (Tuple):
+                output[0] (Tensor): encoded output of shape `(seq_len, batch, embed_dim)`
+                output[1] (ByteTensor/FloatTensor): encoder padding mask (unchanged)
+                output[2] (LongTensor): previous decoder outputs (unchanged)
+        """
+        x = input[0]
+        encoder_padding_mask = input[1]
+        prev_output_tokens = input[2]
+        residual = x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
+        x, _ = self.self_attn(
+            query=x, key=x, value=x, key_padding_mask=encoder_padding_mask
+        )
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
+
+        residual = x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
+        x = self.activation_fn(self.fc1(x))
+        x = F.dropout(x, p=self.activation_dropout, training=self.training)
+        x = self.fc2(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
+        return (x, encoder_padding_mask, prev_output_tokens)
+
+    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
+        assert before ^ after  # exactly one of the two flags must be set
+        if after ^ self.normalize_before:  # pre-norm fires on "before", post-norm on "after"
+            return layer_norm(x)
+        else:
+            return x
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer block.
+
+    In the original paper each operation (multi-head attention, encoder
+    attention or FFN) is postprocessed with: `dropout -> add residual ->
+    layernorm`. In the tensor2tensor code they suggest that learning is more
+    robust when preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.decoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.self_attn = MultiheadAttention(
+            embed_dim=self.embed_dim,
+            num_heads=args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=True,
+        )
+        self.dropout = args.dropout
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu")
+        )
+        self.activation_dropout = getattr(args, "activation_dropout", 0)
+        if self.activation_dropout == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            self.activation_dropout = getattr(args, "relu_dropout", 0)
+        self.normalize_before = args.decoder_normalize_before
+
+        # use layerNorm rather than FusedLayerNorm for exporting.
+        # char_inputs can be used to determine this.
+        # TODO  remove this once we update apex with the fix
+        export = getattr(args, "char_inputs", False)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = MultiheadAttention(
+                self.embed_dim,
+                args.decoder_attention_heads,
+                kdim=getattr(args, "encoder_embed_dim", None),
+                vdim=getattr(args, "encoder_embed_dim", None),
+                dropout=args.attention_dropout,
+                encoder_decoder_attention=True,
+            )
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
+        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
+
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
+        self.need_attn = True
+
+        self.onnx_trace = False
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    def forward(self, input):
+        """
+        Args:
+            input (Tuple):
+                input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+                input[1] (Tensor): encoder output of shape `(src_len, batch, embed_dim)`
+                input[2] (ByteTensor/FloatTensor): encoder padding mask -
+                    binary ByteTensor of shape `(batch, src_len)` where padding elements
+                    are indicated by ``1``.
+        Returns:
+            output (Tuple), or only output[0] when `input` is a bare tensor:
+                output[0] (Tensor): decoded output of shape `(seq_len, batch, embed_dim)`
+                output[1] (Tensor): encoder output, passed through unchanged
+                output[2] (ByteTensor/FloatTensor): encoder padding mask, passed through
+        """
+        # Note: incremental state is not yet supported
+        mt_task = False
+        if isinstance(input, tuple):
+            x = input[0]
+            encoder_out = input[1]
+            encoder_padding_mask = input[2]
+            incremental_state = None
+            mt_task = True
+        else:
+            x = input
+            encoder_out = None
+            encoder_padding_mask = None
+            incremental_state = None
+
+        if incremental_state is None:  # always true here, so the causal mask is always built
+            self_attn_mask = self.buffered_future_mask(x)
+        else:
+            self_attn_mask = None
+
+        # TODO: add back prev_self_attn_state, prev_attn_state,
+        # self_attn_padding_mask
+        prev_self_attn_state = None
+        prev_attn_state = None
+        self_attn_padding_mask = None
+
+        residual = x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
+        if prev_self_attn_state is not None:  # never taken: hard-coded to None above
+            if incremental_state is None:
+                incremental_state = {}
+            prev_key, prev_value = prev_self_attn_state
+            saved_state = {"prev_key": prev_key, "prev_value": prev_value}
+            self.self_attn._set_input_buffer(incremental_state, saved_state)
+        x, attn = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=self_attn_padding_mask,
+            incremental_state=incremental_state,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
+
+        if self.encoder_attn is not None:
+            residual = x
+            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
+            if prev_attn_state is not None:  # never taken: hard-coded to None above
+                if incremental_state is None:
+                    incremental_state = {}
+                prev_key, prev_value = prev_attn_state
+                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=(not self.training and self.need_attn),
+            )
+            x = F.dropout(x, p=self.dropout, training=self.training)
+            x = residual + x
+            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
+
+        residual = x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
+        x = self.activation_fn(self.fc1(x))
+        x = F.dropout(x, p=self.activation_dropout, training=self.training)
+        x = self.fc2(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
+
+        if mt_task:  # `attn` is computed above but not returned in either case
+            return (x, encoder_out, encoder_padding_mask)
+        return x
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)
+        if (  # (re)build the cached mask when it is missing or on another device
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        if self._future_mask.size(0) < dim:  # cached mask too small: grow it in place
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
+        assert before ^ after  # exactly one of the two flags must be set
+        if after ^ self.normalize_before:  # pre-norm fires on "before", post-norm on "after"
+            return layer_norm(x)
+        else:
+            return x
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        self.need_attn = need_attn  # extra kwargs are accepted and ignored
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):  # nn.Embedding factory
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
+    nn.init.constant_(m.weight[padding_idx], 0)  # zero the padding row
+    return m
+
+
+def Linear(in_features, out_features, bias=True):  # nn.Linear factory
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)
+    if bias:  # the bias, when present, starts at zero
+        nn.init.constant_(m.bias, 0.0)
+    return m
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
new file mode 100644
index 0000000000..7bb0c9ada8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
@@ -0,0 +1,789 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.model_parallel.models.pipeline_parallel_transformer.layers import (
+    Embedding,
+    TransformerDecoderEmbedding,
+    TransformerDecoderLayer,
+    TransformerDecoderOutputLayer,
+    TransformerEncoderEmbedding,
+    TransformerEncoderLayer,
+    TransformerEncoderLayerNorm,
+)
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqDecoder,
+    FairseqEncoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.models.transformer import (
+    base_architecture,
+    transformer_iwslt_de_en,
+    transformer_wmt_en_de_big,
+)
+from fairseq.modules import SinusoidalPositionalEmbedding
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+TORCH_PIPE = False  # set to True by import_pipe() when torch's Pipe can be imported
+RPC_INIT = False  # set to True once import_pipe() has called rpc.init_rpc()
+
+
+def import_pipe():  # bind a module-level Pipe: torch's if importable, else fairscale's
+    global TORCH_PIPE
+    global RPC_INIT
+    try:
+        from torch.distributed.pipeline.sync import Pipe  # noqa
+
+        global Pipe  # applies function-wide: every Pipe import here binds the module name
+        from torch.distributed.pipeline.sync.utils import partition_model
+
+        global partition_model  # likewise published at module level
+        from torch.distributed import rpc
+        import tempfile
+
+        TORCH_PIPE = True
+        # Initialize single process RPC agent since TORCH_PIPE requires
+        # RRef. RRef depends on RPC being initialized and as a result we initialize
+        # RPC with a single node.
+        tmpfile = tempfile.NamedTemporaryFile()  # NOTE(review): created on every call
+        if not RPC_INIT:  # guard so this function calls init_rpc at most once
+            rpc.init_rpc(
+                name="worker",
+                rank=0,
+                world_size=1,
+                rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
+                    init_method="file://{}".format(tmpfile.name),
+                ),
+            )
+            RPC_INIT = True
+        logger.info("Using torch pipe")
+    except ImportError:  # torch pipeline API not importable: fall back to fairscale
+        try:
+            from fairscale.nn import Pipe  # noqa
+
+            logger.info("Using fairscale pipe")
+        except ImportError:
+            raise ImportError("Please install fairscale with: pip install fairscale")
+
+
+@register_model("pipeline_parallel_transformer")
+class PipelineParallelTransformerModel(BaseFairseqModel):
+    def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
+        import_pipe()  # binds module-level Pipe (and partition_model for torch)
+        super().__init__()
+        assert isinstance(encoder, FairseqEncoder)
+        assert isinstance(decoder, FairseqDecoder)
+        encoder_module_list = (
+            [encoder.embedding_layer]
+            + list(encoder.encoder_layers)
+            + [encoder.final_layer_norm]
+        )
+        self.num_encoder_modules = len(encoder_module_list)
+        decoder_module_list = (
+            [decoder.embedding_layer]
+            + list(decoder.decoder_layers)
+            + [decoder.decoder_output_layer]
+        )
+        self.num_decoder_modules = len(decoder_module_list)
+        module_list = encoder_module_list + decoder_module_list  # one flat pipeline
+        self.devices = devices
+        if TORCH_PIPE:  # torch Pipe takes a pre-partitioned model
+            self.model = Pipe(
+                partition_model(nn.Sequential(*module_list), balance, devices),
+                chunks=chunks,
+                checkpoint=checkpoint,
+            )
+        else:  # fairscale Pipe receives balance/devices directly
+            self.model = Pipe(
+                nn.Sequential(*module_list),
+                balance=balance,
+                devices=devices,
+                chunks=chunks,
+                checkpoint=checkpoint,
+            )
+        self.encoder_max_positions = self.max_positions_helper(
+            encoder.embedding_layer, "max_source_positions"
+        )
+        self.decoder_max_positions = self.max_positions_helper(
+            decoder.embedding_layer, "max_target_positions"
+        )
+        self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None)
+        # Note: populated by prepare_for_inference_(); forward() asserts on them in eval
+        self.encoder = None
+        self.decoder = None
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens):
+        if self.training:  # training runs the whole encoder+decoder pipeline
+            input_lst = [src_tokens, src_lengths, prev_output_tokens]
+            input = tuple(i.to(self.devices[0], non_blocking=True) for i in input_lst)
+            if TORCH_PIPE:
+                return self.model(input).local_value()  # unwrap the torch Pipe result
+            else:
+                return self.model(input)
+        else:
+            assert self.encoder is not None and self.decoder is not None, (
+                "encoder and decoder need to be initialized by "
+                + "calling the `prepare_for_inference_()` method"
+            )
+            encoder_output_tuple = self.encoder(input)  # FIXME(review): `input` is unbound here
+            return self.decoder(encoder_output_tuple)
+
+    def prepare_for_inference_(self, cfg):
+        if self.encoder is not None and self.decoder is not None:  # already split
+            logger.info("Encoder and Decoder already initialized")
+            return
+        encoder_module_list = []
+        decoder_module_list = []
+        module_count = 0
+        for partition in self.model.partitions:
+            for module in partition:
+                if module_count < self.num_encoder_modules:  # encoder modules come first
+                    encoder_module_list.append(module)
+                else:
+                    decoder_module_list.append(module)
+                module_count += 1
+        self.model = None  # drop the Pipe wrapper; the modules live on in the two lists
+        self.encoder = TransformerEncoder(
+            cfg.distributed_training, None, None, encoder_module_list
+        )
+        self.decoder = TransformerDecoder(
+            cfg.distributed_training,
+            None,
+            None,
+            decoder_module_list=decoder_module_list,
+        )
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--activation-fn',
+                            choices=utils.get_available_activation_fns(),
+                            help='activation function to use')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
+                            help='dropout probability after activation in FFN.')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension for FFN')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='num encoder layers')
+        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
+                            help='num encoder attention heads')
+        parser.add_argument('--encoder-normalize-before', action='store_true',
+                            help='apply layernorm before each encoder block')
+        parser.add_argument('--encoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the encoder')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension for FFN')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='num decoder layers')
+        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
+                            help='num decoder attention heads')
+        parser.add_argument('--decoder-learned-pos', action='store_true',
+                            help='use learned positional embeddings in the decoder')
+        parser.add_argument('--decoder-normalize-before', action='store_true',
+                            help='apply layernorm before each decoder block')
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--share-all-embeddings', action='store_true',
+                            help='share encoder, decoder and output embeddings'
+                                 ' (requires shared dictionary and embed dim)')
+        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
+                            help='if set, disables positional embeddings (outside self attention)')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion'),
+        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
+                            help='sets adaptive softmax dropout for the tail projections')
+        parser.add_argument('--num-embedding-chunks', type=int, metavar='N', default=1,
+                            help='Number of embedding layer chunks (enables more even distribution'
+                                 'of optimizer states across data parallel nodes'
+                                 'when using optimizer state sharding and'
+                                 'a big embedding vocabulary)')
+        # fmt: on
+
+    @classmethod
+    def build_model_base(cls, args, task):
+        """Build the encoder and decoder and return them as ``(encoder, decoder)``."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        if not hasattr(args, "max_source_positions"):
+            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
+        if not hasattr(args, "max_target_positions"):
+            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
+
+        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
+
+        def build_embedding(dictionary, embed_dim, path=None, num_embed_chunks=1):
+            assert embed_dim % num_embed_chunks == 0, (
+                f"Embedding dimension = {embed_dim} should be divisible "
+                f"by the number of embedding chunks = {num_embed_chunks}"
+            )
+            assert path is None or num_embed_chunks == 1, (
+                "Loading embedding from a path with number of embedding chunks > 1"
+                + " is not yet supported"
+            )
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            # a pretrained embedding file is loaded into a single, unchunked table
+            if path:
+                emb = Embedding(num_embeddings, embed_dim, padding_idx)
+                embed_dict = utils.parse_embedding(path)
+                utils.load_embedding(embed_dict, dictionary, emb)
+            else:
+                embed_chunk_dim = embed_dim // num_embed_chunks
+                emb = nn.ModuleList()
+                for _ in range(num_embed_chunks):
+                    emb.append(Embedding(num_embeddings, embed_chunk_dim, padding_idx))
+            return emb
+
+        num_embed_chunks = args.num_embedding_chunks
+        if args.share_all_embeddings:
+            if src_dict != tgt_dict:
+                raise ValueError("--share-all-embeddings requires a joined dictionary")
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+            encoder_embed_tokens = build_embedding(
+                src_dict,
+                args.encoder_embed_dim,
+                args.encoder_embed_path,
+                num_embed_chunks,
+            )
+            decoder_embed_tokens = encoder_embed_tokens
+            args.share_decoder_input_output_embed = True
+        else:
+            assert args.share_decoder_input_output_embed or num_embed_chunks == 1, (
+                "Not sharing decoder I/O embeddings is not yet supported with number of "
+                + "embedding chunks > 1"
+            )
+            encoder_embed_tokens = build_embedding(
+                src_dict,
+                args.encoder_embed_dim,
+                args.encoder_embed_path,
+                num_embed_chunks,
+            )
+            decoder_embed_tokens = build_embedding(
+                tgt_dict,
+                args.decoder_embed_dim,
+                args.decoder_embed_path,
+                num_embed_chunks,
+            )
+
+        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
+        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
+        return (encoder, decoder)
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return TransformerEncoder(args, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return TransformerDecoder(args, tgt_dict, embed_tokens)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Wrap the encoder/decoder from build_model_base in the pipeline model."""
+        enc, dec = cls.build_model_base(args, task)
+        balance = utils.eval_str_list(args.pipeline_balance, type=int)
+        devices = utils.eval_str_list(args.pipeline_devices, type=int)
+        return PipelineParallelTransformerModel(
+            encoder=enc, decoder=dec, balance=balance, devices=devices,
+            chunks=args.pipeline_chunks,
+            checkpoint=args.pipeline_checkpoint,
+        )
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return (self.encoder_max_positions, self.decoder_max_positions)
+
+    def max_positions_helper(
+        self, embedding_layer, max_positions_field="max_source_positions"
+    ):
+        """Return the length limit of *embedding_layer* for the given field."""
+        positions = embedding_layer.embed_positions
+        limit = getattr(embedding_layer, max_positions_field)
+        if positions is not None:
+            # the positional embeddings may cap the length further
+            limit = min(limit, positions.max_positions)
+        return limit
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Turn raw network output into probabilities (or log-probabilities)."""
+
+        if getattr(self, "adaptive_softmax", None) is not None:
+            target = None
+            if sample is not None:
+                assert "target" in sample
+                target = sample["target"]
+            lprobs = self.adaptive_softmax.get_log_prob(net_output, target=target)
+            return lprobs if log_probs else lprobs.exp_()
+
+        # Pipe() hands back a tuple of tensors; here that tuple carries a
+        # single element, the logits tensor, so unwrap it when needed
+        if isinstance(net_output, torch.Tensor):
+            logits = net_output
+        else:
+            logits = net_output[0]
+        normalize = utils.log_softmax if log_probs else utils.softmax
+        return normalize(logits, dim=-1, onnx_trace=False)
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return self.decoder_max_positions
+
+    def load_state_dict(self, state_dict, strict=True, model_cfg=None):
+        """Load *state_dict* into this module after upgrading old checkpoints.
+
+        A state dict with no ``model.partitions`` key is treated as a regular
+        (non-pipeline) transformer checkpoint and remapped to the partitioned
+        layout first. *model_cfg* is accepted but not used here.
+        """
+        self.upgrade_state_dict(state_dict)
+        already_partitioned = any("model.partitions" in key for key in state_dict)
+        if not already_partitioned:
+            state_dict = self.convert_to_pipeline_parallel_state_dict(state_dict)
+        return super().load_state_dict(state_dict, strict)
+
+    def convert_to_pipeline_parallel_state_dict(self, state_dict):
+        """Remap a regular transformer *state_dict* onto this model's partitions.
+
+        Starts from ``self.state_dict()`` and overwrites the entries of each
+        recognised module in ``self.model.partitions`` with the matching
+        ``encoder.*`` / ``decoder.*`` tensors from *state_dict*; layers are
+        numbered in the order they are met while walking the partitions.
+        Returns the updated dict.
+        """
+        self_attn_suffixes = [
+            "self_attn.k_proj.weight",
+            "self_attn.k_proj.bias",
+            "self_attn.v_proj.weight",
+            "self_attn.v_proj.bias",
+            "self_attn.q_proj.weight",
+            "self_attn.q_proj.bias",
+            "self_attn.out_proj.weight",
+            "self_attn.out_proj.bias",
+            "self_attn_layer_norm.weight",
+            "self_attn_layer_norm.bias",
+        ]
+        encoder_attn_suffixes = [
+            "encoder_attn.k_proj.weight",
+            "encoder_attn.k_proj.bias",
+            "encoder_attn.v_proj.weight",
+            "encoder_attn.v_proj.bias",
+            "encoder_attn.q_proj.weight",
+            "encoder_attn.q_proj.bias",
+            "encoder_attn.out_proj.weight",
+            "encoder_attn.out_proj.bias",
+            "encoder_attn_layer_norm.weight",
+            "encoder_attn_layer_norm.bias",
+        ]
+        ffn_suffixes = [
+            "fc1.weight",
+            "fc1.bias",
+            "fc2.weight",
+            "fc2.bias",
+            "final_layer_norm.weight",
+            "final_layer_norm.bias",
+        ]
+        # decoder layers carry the extra encoder-attention parameters
+        encoder_key_suffixes = self_attn_suffixes + ffn_suffixes
+        decoder_key_suffixes = (
+            self_attn_suffixes + encoder_attn_suffixes + ffn_suffixes
+        )
+        remapped = self.state_dict()
+        enc_idx = 0
+        dec_idx = 0
+        for pid, partition in enumerate(self.model.partitions):
+            logger.info(f"Begin Partition {pid}")
+            for mid, module in enumerate(partition):
+                prefix = f"model.partitions.{pid}.{mid}"
+                # fmt: off
+                if isinstance(module, TransformerEncoderEmbedding):
+                    remapped[f'{prefix}.embed_tokens.weight'] = state_dict['encoder.embed_tokens.weight']
+                    remapped[f'{prefix}.embed_positions._float_tensor'] = state_dict['encoder.embed_positions._float_tensor']
+                if isinstance(module, TransformerEncoderLayer):
+                    for suffix in encoder_key_suffixes:
+                        remapped[f'{prefix}.{suffix}'] = state_dict[f'encoder.layers.{enc_idx}.{suffix}']
+                    enc_idx += 1
+                if isinstance(module, TransformerDecoderLayer):
+                    for suffix in decoder_key_suffixes:
+                        remapped[f'{prefix}.{suffix}'] = state_dict[f'decoder.layers.{dec_idx}.{suffix}']
+                    dec_idx += 1
+                if isinstance(module, TransformerEncoderLayerNorm):
+                    if 'encoder.layer_norm.weight' in state_dict:
+                        remapped[f'{prefix}.layer_norm.weight'] = state_dict['encoder.layer_norm.weight']
+                        remapped[f'{prefix}.layer_norm.bias'] = state_dict['encoder.layer_norm.bias']
+                if isinstance(module, TransformerDecoderEmbedding):
+                    remapped[f'{prefix}.embed_tokens.weight'] = state_dict['decoder.embed_tokens.weight']
+                    remapped[f'{prefix}.embed_positions._float_tensor'] = state_dict['decoder.embed_positions._float_tensor']
+                if isinstance(module, TransformerDecoderOutputLayer):
+                    remapped[f'{prefix}.output_projection.weight'] = state_dict['decoder.output_projection.weight']
+                # fmt: on
+        return remapped
+
+
+class TransformerEncoder(FairseqEncoder):
+    """
+    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`TransformerEncoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_tokens (torch.nn.Embedding or nn.ModuleList): input embedding
+        encoder_module_list (optional): modules to wrap in a ``Pipe`` (default: None)
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))
+        import_pipe()
+        self.use_pipeline = encoder_module_list is not None
+        # no module list: build the embedding, layer stack and final norm here
+        if not self.use_pipeline:
+            self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
+            self.encoder_layers = nn.Sequential(
+                *[TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
+            )
+            if isinstance(embed_tokens, nn.ModuleList):
+                emb_dim = sum(e.embedding_dim for e in embed_tokens)
+            else:
+                emb_dim = embed_tokens.embedding_dim
+            self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
+        else:
+            encoder_balance = utils.eval_str_list(
+                args.pipeline_encoder_balance, type=int
+            )
+            encoder_devices = utils.eval_str_list(
+                args.pipeline_encoder_devices, type=int
+            )
+            assert sum(encoder_balance) == len(encoder_module_list), (
+                f"Sum of encoder_balance={encoder_balance} is not equal "
+                + f"to num_encoder_modules={len(encoder_module_list)}"
+            )
+            if TORCH_PIPE:
+                self.model = Pipe(
+                    module=partition_model(
+                        nn.Sequential(*encoder_module_list),
+                        encoder_balance,
+                        encoder_devices,
+                    ),
+                    chunks=args.pipeline_chunks,
+                    checkpoint=args.pipeline_checkpoint,
+                )
+            else:
+                self.model = Pipe(
+                    module=nn.Sequential(*encoder_module_list),
+                    balance=encoder_balance,
+                    devices=encoder_devices,
+                    chunks=args.pipeline_chunks,
+                    checkpoint=args.pipeline_checkpoint,
+                )
+
+    def forward(self, src_tokens, src_lengths):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (torch.LongTensor): lengths of each source sentence of
+                shape `(batch)`
+
+        Returns:
+            EncoderOut: only the first two fields are populated; the other
+            four are always ``None``.
+
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch, src_len)`
+
+        A one-element zero tensor is appended to the inputs as a placeholder
+        for ``prev_output_tokens``.
+        """
+        dummy_prev_output_tokens = torch.zeros(
+            1, dtype=src_tokens.dtype, device=src_tokens.device
+        )
+        input_tuple = (src_tokens, src_lengths, dummy_prev_output_tokens)
+        if self.use_pipeline:
+            input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple)
+            if TORCH_PIPE:
+                encoder_out = self.model(input_tuple).local_value()
+            else:
+                encoder_out = self.model(input_tuple)
+        else:
+            encoder_embed_output_tuple = self.embedding_layer(input_tuple)
+            encoder_layers_output = self.encoder_layers(encoder_embed_output_tuple)
+            encoder_out = self.final_layer_norm(encoder_layers_output)
+        # first element is the encoder output
+        # second element is the encoder padding mask
+        # the remaining elements of EncoderOut are not computed by
+        # the PipelineParallelTransformer
+        return EncoderOut(encoder_out[0], encoder_out[1], None, None, None, None)
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """
+        Reorder encoder output according to *new_order*.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            *encoder_out* rearranged according to *new_order*
+        """
+        if encoder_out.encoder_out is not None:
+            encoder_out = encoder_out._replace(
+                encoder_out=encoder_out.encoder_out.index_select(1, new_order)
+            )
+        if encoder_out.encoder_padding_mask is not None:
+            encoder_out = encoder_out._replace(
+                encoder_padding_mask=encoder_out.encoder_padding_mask.index_select(
+                    0, new_order
+                )
+            )
+        if encoder_out.encoder_embedding is not None:
+            encoder_out = encoder_out._replace(
+                encoder_embedding=encoder_out.encoder_embedding.index_select(
+                    0, new_order
+                )
+            )
+        if encoder_out.encoder_states is not None:
+            for idx, state in enumerate(encoder_out.encoder_states):
+                encoder_out.encoder_states[idx] = state.index_select(1, new_order)
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder (needs ``embedding_layer``)."""
+        if self.embedding_layer.embed_positions is None:
+            return self.embedding_layer.max_source_positions
+        return min(
+            self.embedding_layer.max_source_positions,
+            self.embedding_layer.embed_positions.max_positions,
+        )
+
+
+class TransformerDecoder(FairseqDecoder):
+    """
+    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): passed to each layer (default: False)
+        decoder_module_list (optional): modules to wrap in a ``Pipe`` (default: None)
+    """
+
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+        decoder_module_list=None,
+    ):
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))
+        import_pipe()
+        self.use_pipeline = decoder_module_list is not None
+        # no module list: build the embedding, layer stack and output layer here
+        if not self.use_pipeline:
+            self.embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
+            self.decoder_layers = nn.Sequential(
+                *[
+                    TransformerDecoderLayer(args, no_encoder_attn)
+                    for _ in range(args.decoder_layers)
+                ]
+            )
+            self.decoder_output_layer = TransformerDecoderOutputLayer(
+                args, embed_tokens, dictionary
+            )
+        else:
+            decoder_balance = utils.eval_str_list(
+                args.pipeline_decoder_balance, type=int
+            )
+            decoder_devices = utils.eval_str_list(
+                args.pipeline_decoder_devices, type=int
+            )
+            assert sum(decoder_balance) == len(decoder_module_list), (
+                f"Sum of decoder_balance={decoder_balance} is not equal "
+                + f"to num_decoder_modules={len(decoder_module_list)}"
+            )
+            if TORCH_PIPE:
+                self.model = Pipe(
+                    module=partition_model(
+                        nn.Sequential(*decoder_module_list),
+                        decoder_balance,
+                        decoder_devices,
+                    ),
+                    chunks=args.pipeline_chunks,
+                    checkpoint=args.pipeline_checkpoint,
+                )
+            else:
+                self.model = Pipe(
+                    module=nn.Sequential(*decoder_module_list),
+                    balance=decoder_balance,
+                    devices=decoder_devices,
+                    chunks=args.pipeline_chunks,
+                    checkpoint=args.pipeline_checkpoint,
+                )
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out: output from the encoder. Required despite the
+                ``None`` default: its ``encoder_out`` and
+                ``encoder_padding_mask`` fields are read unconditionally.
+
+        Returns:
+            tuple: a 1-tuple holding the output of the last module (the
+            output layer, or the final pipeline stage when pipelined).
+        """
+        input_tuple = (
+            encoder_out.encoder_out,
+            encoder_out.encoder_padding_mask,
+            prev_output_tokens,
+        )
+        if self.use_pipeline:
+            input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple)
+            if TORCH_PIPE:
+                return (self.model(input_tuple).local_value(),)
+            else:
+                return (self.model(input_tuple),)
+        else:
+            embed_layer_output = self.embedding_layer(input_tuple)
+            state = self.decoder_layers(embed_layer_output)
+            return (self.decoder_output_layer(state),)
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size. NOTE(review): reads
+        ``share_input_output_embed``, which ``__init__`` above never assigns."""
+        if self.adaptive_softmax is None:
+            # project back to size of vocabulary
+            if self.share_input_output_embed:
+                return F.linear(features, self.embed_tokens.weight)
+            else:
+                return F.linear(features, self.embed_out)
+        else:
+            return features
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder (needs ``embedding_layer``)."""
+        if self.embedding_layer.embed_positions is None:
+            return self.embedding_layer.max_target_positions
+        return min(
+            self.embedding_layer.max_target_positions,
+            self.embedding_layer.embed_positions.max_positions,
+        )
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)
+        if (
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq.
+        NOTE(review): reads ``self.embed_positions`` and ``self.layers``, which
+        ``__init__`` above never assigns - confirm this path is still used."""
+        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
+            weights_key = "{}.embed_positions.weights".format(name)
+            if weights_key in state_dict:
+                del state_dict[weights_key]
+            state_dict[
+                "{}.embed_positions._float_tensor".format(name)
+            ] = torch.FloatTensor(1)
+
+        for i in range(len(self.layers)):
+            # update layer norms
+            layer_norm_map = {
+                "0": "self_attn_layer_norm",
+                "1": "encoder_attn_layer_norm",
+                "2": "final_layer_norm",
+            }
+            for old, new in layer_norm_map.items():
+                for m in ("weight", "bias"):
+                    k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m)
+                    if k in state_dict:
+                        state_dict[
+                            "{}.layers.{}.{}.{}".format(name, i, new, m)
+                        ] = state_dict[k]
+                        del state_dict[k]
+
+        version_key = "{}.version".format(name)
+        if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+
+        return state_dict
+
+
+@register_model_architecture(
+    "pipeline_parallel_transformer", "transformer_iwslt_de_en_pipeline_parallel"
+)
+def transformer_iwslt_de_en_dist(args):
+    transformer_iwslt_de_en(args)
+
+
+@register_model_architecture(
+    "pipeline_parallel_transformer", "transformer_wmt_en_de_big_pipeline_parallel"
+)
+def transformer_wmt_en_de_big_dist(args):
+    transformer_wmt_en_de_big(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
new file mode 100644
index 0000000000..cf3b2e8baf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch.nn as nn
+
+from fairseq.model_parallel.modules import (
+    ModelParallelTransformerDecoderLayer,
+    ModelParallelTransformerEncoderLayer,
+)
+from fairseq.models import register_model
+from fairseq.models.transformer import (
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+)
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        VocabParallelEmbedding,
+        copy_to_model_parallel_region,
+        gather_from_model_parallel_region,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("model_parallel_transformer")
+class ModelParallelTransformerModel(TransformerModel):
+    """Model parallel Transformer model."""
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        """Build the ``VocabParallelEmbedding`` after padding *dictionary*."""
+        if not has_megatron_submodule:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+        # Pretrained embeddings are unsupported: fail before the dictionary is
+        # padded and before the embedding table is allocated.
+        if path:
+            raise NotImplementedError(
+                "Loading of embedding from path is not supported for model parallel"
+            )
+        dictionary.pad_to_multiple_(args.model_parallel_size * 8)
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+
+        def _vocab_init(tensor, **kwargs):
+            nn.init.normal_(tensor, mean=0, std=num_embeddings**-0.5)
+            # NOTE(review): row 1 is hard-coded; presumably the pad row - confirm
+            nn.init.constant_(tensor[1], 0)
+
+        return VocabParallelEmbedding(
+            num_embeddings, embed_dim, padding_idx, init_method=_vocab_init
+        )
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return ModelParallelTransformerEncoder(args, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return ModelParallelTransformerDecoder(
+            args,
+            tgt_dict,
+            embed_tokens,
+            no_encoder_attn=getattr(args, "no_cross_attention", False),
+        )
+
+
+class ModelParallelTransformerEncoder(TransformerEncoder):
+    """
+    Model parallel Transformer encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`ModelParallelTransformerEncoderLayer`.
+    """
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(args, dictionary, embed_tokens)
+        # a missing ``no_final_layer_norm`` attribute is treated as False
+        if getattr(args, "no_final_layer_norm", False):
+            self.layer_norm = None
+
+    def build_encoder_layer(self, args):
+        return ModelParallelTransformerEncoderLayer(args)
+
+
+class ModelParallelTransformerDecoder(TransformerDecoder):
+    """
+    Decoder counterpart of the model parallel Transformer: a stack of
+    *args.decoder_layers* :class:`ModelParallelTransformerDecoderLayer` layers.
+    """
+
+    def build_decoder_layer(self, args, no_encoder_attn=False):
+        return ModelParallelTransformerDecoderLayer(args, no_encoder_attn)
+
+    def output_layer(self, features, **kwargs):
+        """Map decoder features onto the vocabulary dimension."""
+        if not self.share_input_output_embed:
+            raise NotImplementedError(
+                "Model parallel training currently requires --share-decoder-input-output-embed"
+            )
+
+        # project the features back to the size of the vocabulary
+        logits = self.output_projection(copy_to_model_parallel_region(features))
+
+        # the gather is skipped only when the criterion is
+        # vocab_parallel_cross_entropy; otherwise return the gathered tensor
+        if self.args.criterion == "vocab_parallel_cross_entropy":
+            return logits
+        return gather_from_model_parallel_region(logits).contiguous()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
new file mode 100644
index 0000000000..03e4dbe263
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
@@ -0,0 +1,169 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer_lm import TransformerLanguageModel
+
+try:
+    from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model("model_parallel_transformer_lm")
+class ModelParallelTransformerLanguageModel(TransformerLanguageModel):
+    """Transformer language model assembled from model-parallel (Megatron) modules."""
+
+    @staticmethod
+    def add_args(parser):
+        TransformerLanguageModel.add_args(parser)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a model instance from *args* using the dictionaries of *task*."""
+        if not has_megatron_submodule:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+
+        # older checkpoints may lack newer options; fill in their defaults first
+        base_lm_architecture(args)
+
+        vocab_multiple = args.model_parallel_size * 8
+        task.source_dictionary.pad_to_multiple_(vocab_multiple)
+        task.target_dictionary.pad_to_multiple_(vocab_multiple)
+
+        if args.decoder_layers_to_keep:
+            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
+
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = getattr(
+                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+            )
+
+        if args.character_embeddings:
+            raise NotImplementedError(
+                "Character embeddings is not supported for model parallel"
+            )
+        if args.adaptive_input:
+            raise NotImplementedError(
+                "Adaptive input is not supported for model parallel"
+            )
+
+        token_embedding = cls.build_embedding(
+            args, task.source_dictionary, args.decoder_input_dim
+        )
+        return cls(
+            ModelParallelTransformerDecoder(
+                args, task.target_dictionary, token_embedding, no_encoder_attn=True
+            )
+        )
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        """Create the vocab-parallel token embedding sized to *dictionary*."""
+        def _vocab_init(tensor, **kwargs):
+            nn.init.normal_(tensor, mean=0, std=embed_dim**-0.5)
+            nn.init.constant_(tensor[1], 0)
+
+        return VocabParallelEmbedding(
+            len(dictionary), embed_dim, dictionary.pad(), init_method=_vocab_init
+        )
+
+
+def base_lm_architecture(args):
+    """Fill in defaults for LM options missing from *args*, modifying it in place."""
+    if hasattr(args, "no_tie_adaptive_proj"):
+        # backward compatibility: older checkpoints defined --no-tie-adaptive-proj, so
+        # the presence of that option marks an "old" model checkpoint
+        args.no_decoder_final_norm = True  # old models always set this to True
+        if args.no_tie_adaptive_proj is False:
+            args.tie_adaptive_proj = True
+    if hasattr(args, "decoder_final_norm"):
+        args.no_decoder_final_norm = not args.decoder_final_norm
+    # every getattr below keeps a value that is already set and only supplies a default
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.relu_dropout = getattr(args, "relu_dropout", 0.0)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    # set unconditionally (overrides any given value): training is not stable without it
+    args.decoder_normalize_before = True
+    args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.character_embeddings = getattr(args, "character_embeddings", False)
+    args.character_filters = getattr(
+        args,
+        "character_filters",
+        "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
+    )
+    args.character_embedding_dim = getattr(args, "character_embedding_dim", 4)
+    args.char_embedder_highway_layers = getattr(args, "char_embedder_highway_layers", 2)
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
+    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None)
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0.0)
+    args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+    args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0.0)
+    args.add_bos_token = getattr(args, "add_bos_token", False)
+
+
+@register_model_architecture("model_parallel_transformer_lm", "transformer_lm_megatron")
+def transformer_lm_megatron(args):
+    """Apply transformer_lm_megatron defaults, then the base LM defaults, to *args*."""
+    for option, fallback in (
+        ("decoder_embed_dim", 3072), ("decoder_ffn_embed_dim", 3072 * 4),
+        ("decoder_layers", 72), ("decoder_attention_heads", 32),
+        ("dropout", 0.1), ("attention_dropout", 0.1), ("activation_fn", "gelu"),
+    ):
+        setattr(args, option, getattr(args, option, fallback))
+    base_lm_architecture(args)
+
+
+@register_model_architecture(
+    "model_parallel_transformer_lm", "transformer_lm_megatron_11b"
+)
+def transformer_lm_megatron_11b(args):
+    """Apply the 11b Megatron defaults, then the base LM defaults, to *args*."""
+    for option, fallback in (
+        ("decoder_embed_dim", 3072), ("decoder_ffn_embed_dim", 3072 * 6),
+        ("decoder_layers", 72), ("decoder_attention_heads", 32),
+        ("dropout", 0.1), ("attention_dropout", 0.1), ("activation_fn", "gelu"),
+    ):
+        setattr(args, option, getattr(args, option, fallback))
+    base_lm_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
new file mode 100644
index 0000000000..11603217a1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+from .multihead_attention import ModelParallelMultiheadAttention
+from .transformer_layer import (
+    ModelParallelTransformerEncoderLayer,
+    ModelParallelTransformerDecoderLayer,
+)
+
+__all__ = [
+    "ModelParallelMultiheadAttention",
+    "ModelParallelTransformerEncoderLayer",
+    "ModelParallelTransformerDecoderLayer",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
new file mode 100644
index 0000000000..bbea450950
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
@@ -0,0 +1,349 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        ColumnParallelLinear,
+        RowParallelLinear,
+        get_cuda_rng_tracker,
+        get_model_parallel_world_size,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+@with_incremental_state
+class ModelParallelMultiheadAttention(nn.Module):
+    """Model parallel Multi-headed attention.
+    This performs the Multi-headed attention over multiple gpus.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        self_attention=False,
+        encoder_decoder_attention=False,
+    ):
+        """Check head partitioning and build the parallel q/k/v/out projections."""
+        super().__init__()
+        if not has_megatron_submodule:
+            raise ImportError(
+                "\n\nPlease install the megatron submodule:"
+                "\n\n  git submodule update --init "
+                "fairseq/model_parallel/megatron"
+            )
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+        # attention heads are divided evenly among the model-parallel partitions
+        self.model_parallel_size = get_model_parallel_world_size()
+        self.num_heads_partition = num_heads // self.model_parallel_size
+        assert (
+            self.num_heads_partition * self.model_parallel_size == num_heads
+        ), "Number of heads must be divisible by model parallel size"
+
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim**-0.5
+
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+
+        assert (
+            not self.self_attention or self.qkv_same_dim
+        ), "Self-attention requires query, key and value to be of the same size"
+        # q/k/v outputs stay un-gathered and feed out_proj as parallel input
+        self.k_proj = ColumnParallelLinear(
+            self.kdim, embed_dim, bias=bias, gather_output=False
+        )
+        self.v_proj = ColumnParallelLinear(
+            self.vdim, embed_dim, bias=bias, gather_output=False
+        )
+        self.q_proj = ColumnParallelLinear(
+            embed_dim, embed_dim, bias=bias, gather_output=False
+        )
+        self.out_proj = RowParallelLinear(
+            embed_dim, embed_dim, bias=bias, input_is_parallel=True
+        )
+
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        **unused_kwargs,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel. Returns ``(attn, None)``.
+
+        Args:
+            key_padding_mask (ByteTensor, optional): mask of shape
+                `(batch, src_len)`; 1s mark padding keys to exclude.
+            incremental_state (dict, optional): cache of earlier keys/values.
+            static_kv (bool): reuse cached keys/values instead of appending.
+            attn_mask (Tensor, optional): added to the attention scores,
+                typically to stop attention from looking forward in time.
+        """
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+
+        is_tpu = query.device.type == "xla"
+
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+
+        if self.self_attention:
+            q = self.q_proj(query)
+            k = self.k_proj(query)
+            v = self.v_proj(query)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                assert value is None
+                k = v = None
+            else:
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+        # reshape to (bsz * num_heads_partition, seq_len, head_dim) for batched matmul
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads_partition, self.head_dim)
+            .transpose(0, 1)
+        )
+        if k is not None:
+            k = (
+                k.contiguous()
+                .view(-1, bsz * self.num_heads_partition, self.head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, bsz * self.num_heads_partition, self.head_dim)
+                .transpose(0, 1)
+            )
+        # merge with the cached keys/values from earlier steps, then refresh the cache
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads_partition, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(
+                    bsz * self.num_heads_partition, -1, self.head_dim
+                )
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(
+                    bsz * self.num_heads_partition, -1, self.head_dim
+                )
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = (
+                ModelParallelMultiheadAttention._append_prev_key_padding_mask(
+                    key_padding_mask=key_padding_mask,
+                    prev_key_padding_mask=prev_key_padding_mask,
+                    batch_size=bsz,
+                    src_len=k.size(1),
+                    static_kv=static_kv,
+                )
+            )
+
+            saved_state["prev_key"] = k.view(
+                bsz, self.num_heads_partition, -1, self.head_dim
+            )
+            saved_state["prev_value"] = v.view(
+                bsz, self.num_heads_partition, -1, self.head_dim
+            )
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        src_len = k.size(1)
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+
+        assert list(attn_weights.size()) == [
+            bsz * self.num_heads_partition,
+            tgt_len,
+            src_len,
+        ]
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(
+                bsz, self.num_heads_partition, tgt_len, src_len
+            )
+            if not is_tpu:
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
+                    float("-inf"),
+                )
+            else:
+                attn_weights = attn_weights.transpose(0, 2)
+                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
+                attn_weights = attn_weights.transpose(0, 2)
+            attn_weights = attn_weights.view(
+                bsz * self.num_heads_partition, tgt_len, src_len
+            )
+
+        attn_weights_float = utils.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights_float.type_as(attn_weights)
+
+        with get_cuda_rng_tracker().fork():
+            attn_probs = self.dropout_module(attn_weights)
+
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [
+            bsz * self.num_heads_partition,
+            tgt_len,
+            self.head_dim,
+        ]
+        embed_dim_partition = embed_dim // self.model_parallel_size
+        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim_partition)
+        attn = self.out_proj(attn)
+        # return attn_weights None to keep the return type same as single gpu multihead attention
+        # This will be deprecated.
+        attn_weights: Optional[Tensor] = None
+
+        return attn, attn_weights
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        """Merge the cached and current key padding masks (None if neither exists)."""
+        # saved key padding masks have shape (bsz, seq_len)
+        if prev_key_padding_mask is not None and static_kv:
+            new_key_padding_mask = prev_key_padding_mask
+        elif prev_key_padding_mask is not None and key_padding_mask is not None:
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and leaves the
+        # frame, there will be a time when prev or current is None. The zero filler
+        # is allocated on the existing mask's device (not only when it is CUDA).
+        elif prev_key_padding_mask is not None:
+            filler = prev_key_padding_mask.new_zeros(
+                (batch_size, src_len - prev_key_padding_mask.size(1))
+            )
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), filler.float()], dim=1
+            )
+        elif key_padding_mask is not None:
+            filler = key_padding_mask.new_zeros(
+                (batch_size, src_len - key_padding_mask.size(1))
+            )
+            new_key_padding_mask = torch.cat(
+                [filler.float(), key_padding_mask.float()], dim=1
+            )
+        else:
+            new_key_padding_mask = prev_key_padding_mask
+        return new_key_padding_mask
+
+    def reorder_incremental_state(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order
+    ):
+        """Re-index every cached tensor along dim 0 by *new_order* (incremental generation)."""
+        cached = self._get_input_buffer(incremental_state)
+        if cached is None:
+            return incremental_state
+        for name, tensor in cached.items():
+            if tensor is not None:
+                cached[name] = tensor.index_select(0, new_order)
+        return self._set_input_buffer(incremental_state, cached)
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        """Return the stored "attn_state" buffer, or a new empty dict when none is stored."""
+        cached = self.get_incremental_state(incremental_state, "attn_state")
+        if cached is None:
+            fresh: Dict[str, Optional[Tensor]] = {}
+            return fresh
+        return cached
+
+    def _set_input_buffer(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        """Store *buffer* under "attn_state" and return the resulting state."""
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
new file mode 100644
index 0000000000..7ab53c6e5f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
+from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer
+
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class ModelParallelTransformerEncoderLayer(TransformerEncoderLayer):
+    """Encoder layer whose feed-forward and self-attention modules are model parallel.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        if q_noise > 0:
+            raise NotImplementedError("Quant noise is not supported for model parallel")
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        if q_noise > 0:
+            raise NotImplementedError("Quant noise is not supported for model parallel")
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(self, embed_dim, args, **unused_kwargs):
+        return ModelParallelMultiheadAttention(
+            embed_dim=embed_dim,
+            num_heads=args.encoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=True,
+        )
+
+
+class ModelParallelTransformerDecoderLayer(TransformerDecoderLayer):
+    """Decoder layer whose feed-forward and attention modules are model parallel.
+
+    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
+    """
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        if q_noise > 0:
+            raise NotImplementedError("Quant noise is not supported for model parallel")
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        if q_noise > 0:
+            raise NotImplementedError("Quant noise is not supported for model parallel")
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(self, embed_dim, args, **unused_kwargs):
+        return ModelParallelMultiheadAttention(
+            embed_dim=embed_dim,
+            num_heads=args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=not getattr(args, "cross_self_attention", False),
+        )
+
+    def build_encoder_attention(self, embed_dim, args, **unused_kwargs):
+        return ModelParallelMultiheadAttention(
+            embed_dim=embed_dim,
+            num_heads=args.decoder_attention_heads,
+            kdim=getattr(args, "encoder_embed_dim", None),
+            vdim=getattr(args, "encoder_embed_dim", None),
+            dropout=args.attention_dropout,
+            encoder_decoder_attention=True,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
new file mode 100644
index 0000000000..11cf6ee530
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
@@ -0,0 +1,236 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import argparse
+import importlib
+import os
+
+from contextlib import ExitStack
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import merge_with_parent
+from hydra.core.config_store import ConfigStore
+from omegaconf import open_dict, OmegaConf
+
+from .composite_encoder import CompositeEncoder
+from .distributed_fairseq_model import DistributedFairseqModel
+from .fairseq_decoder import FairseqDecoder
+from .fairseq_encoder import FairseqEncoder
+from .fairseq_incremental_decoder import FairseqIncrementalDecoder
+from .fairseq_model import (
+    BaseFairseqModel,
+    FairseqEncoderDecoderModel,
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    FairseqModel,
+    FairseqMultiModel,
+)
+
+
+MODEL_REGISTRY = {}
+MODEL_DATACLASS_REGISTRY = {}
+ARCH_MODEL_REGISTRY = {}
+ARCH_MODEL_NAME_REGISTRY = {}
+ARCH_MODEL_INV_REGISTRY = {}
+ARCH_CONFIG_REGISTRY = {}
+
+
+__all__ = [
+    "BaseFairseqModel",
+    "CompositeEncoder",
+    "DistributedFairseqModel",
+    "FairseqDecoder",
+    "FairseqEncoder",
+    "FairseqEncoderDecoderModel",
+    "FairseqEncoderModel",
+    "FairseqIncrementalDecoder",
+    "FairseqLanguageModel",
+    "FairseqModel",
+    "FairseqMultiModel",
+]
+
+
+def build_model(cfg: FairseqDataclass, task, from_checkpoint=False):
+    """Find the model class for *cfg*, apply its defaults, and build it for *task*."""
+    model = None
+    model_type = getattr(cfg, "_name", None) or getattr(cfg, "arch", None)
+
+    if not model_type and len(cfg) == 1:
+        # this is hit if config object is nested in directory that is named after model type
+
+        model_type = next(iter(cfg))
+        if model_type in MODEL_DATACLASS_REGISTRY:
+            cfg = cfg[model_type]
+        else:
+            raise Exception(
+                "Could not infer model type from directory. Please add _name field to indicate model type. "
+                "Available models: "
+                + str(MODEL_DATACLASS_REGISTRY.keys())
+                + " Requested model type: "
+                + model_type
+            )
+
+    if model_type in ARCH_MODEL_REGISTRY:
+        # case 1: legacy models
+        model = ARCH_MODEL_REGISTRY[model_type]
+    elif model_type in MODEL_DATACLASS_REGISTRY:
+        # case 2: config-driven models
+        model = MODEL_REGISTRY[model_type]
+
+    if model_type in MODEL_DATACLASS_REGISTRY:
+        # set defaults from dataclass. note that arch name and model name can be the same
+        dc = MODEL_DATACLASS_REGISTRY[model_type]
+
+        if isinstance(cfg, argparse.Namespace):
+            cfg = dc.from_namespace(cfg)
+        else:
+            cfg = merge_with_parent(dc(), cfg, from_checkpoint)
+    else:
+        if model_type in ARCH_CONFIG_REGISTRY:
+            with open_dict(cfg) if OmegaConf.is_config(cfg) else ExitStack():
+                # this calls the different "arch" functions (like base_architecture()) that you indicate
+                # if you specify --arch on the command line. this is only applicable to the old argparse based models
+                # hydra models should expose different architectures via different config files
+                # it will modify the cfg object and default parameters according to the arch
+                ARCH_CONFIG_REGISTRY[model_type](cfg)
+
+    assert model is not None, (  # f-strings only: str.format would trip on braces in cfg
+        f"Could not infer model type from {cfg}. "
+        f"Available models: {MODEL_DATACLASS_REGISTRY.keys()}"
+        f" Requested model type: {model_type}"
+    )
+
+    return model.build_model(cfg, task)
+
+
+def register_model(name, dataclass=None):
+    """
+    Decorator that registers a new model type under *name*. If *name* is
+    already registered, the previously registered class is returned instead.
+
+    For example::
+
+        @register_model('lstm')
+        class LSTM(FairseqEncoderDecoderModel):
+            (...)
+
+    .. note:: All models must implement the :class:`BaseFairseqModel` interface;
+        typically extend :class:`FairseqEncoderDecoderModel` (sequence-to-sequence)
+        or :class:`FairseqLanguageModel` (language modeling).
+
+    Args:
+        name (str): the name of the model
+        dataclass (optional): FairseqDataclass subclass holding the model config
+    """
+
+    def register_model_cls(cls):
+        if name in MODEL_REGISTRY:
+            return MODEL_REGISTRY[name]
+
+        if not issubclass(cls, BaseFairseqModel):
+            raise ValueError(
+                "Model ({}: {}) must extend BaseFairseqModel".format(name, cls.__name__)
+            )
+        MODEL_REGISTRY[name] = cls
+        if dataclass is not None and not issubclass(dataclass, FairseqDataclass):
+            raise ValueError(
+                "Dataclass {} must extend FairseqDataclass".format(dataclass)
+            )
+
+        cls.__dataclass = dataclass
+        if dataclass is not None:
+            MODEL_DATACLASS_REGISTRY[name] = dataclass
+            # publish a default-constructed config node in the "model" group
+            cs = ConfigStore.instance()
+            node = dataclass()
+            node._name = name
+            cs.store(name=name, group="model", node=node, provider="fairseq")
+            # also register a no-op architecture that shares the model's name
+            @register_model_architecture(name, name)
+            def noop(_):
+                pass
+
+        return cls
+
+    return register_model_cls
+
+
+def register_model_architecture(model_name, arch_name):
+    """
+    New model architectures can be added to fairseq with the
+    :func:`register_model_architecture` function decorator. After registration,
+    model architectures can be selected with the ``--arch`` command-line
+    argument.
+
+    For example::
+
+        @register_model_architecture('lstm', 'lstm_luong_wmt_en_de')
+        def lstm_luong_wmt_en_de(cfg):
+            cfg.encoder_embed_dim = getattr(cfg, 'encoder_embed_dim', 1000)
+            (...)
+
+    The decorated function takes a single argument *cfg* (the model config, an
+    :class:`argparse.Namespace` or :class:`omegaconf.DictConfig`) and should
+    modify it in-place to match the desired architecture.
+
+    Args:
+        model_name (str): the name of the Model (Model must already be
+            registered)
+        arch_name (str): the name of the model architecture (``--arch``)
+    """
+
+    def register_model_arch_fn(fn):
+        if model_name not in MODEL_REGISTRY:
+            raise ValueError(
+                "Cannot register model architecture for unknown model type ({})".format(
+                    model_name
+                )
+            )
+        if arch_name in ARCH_MODEL_REGISTRY:
+            raise ValueError(
+                "Cannot register duplicate model architecture ({})".format(arch_name)
+            )
+        if not callable(fn):
+            raise ValueError(
+                "Model architecture must be callable ({})".format(arch_name)
+            )
+        ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name]
+        ARCH_MODEL_NAME_REGISTRY[arch_name] = model_name
+        ARCH_MODEL_INV_REGISTRY.setdefault(model_name, []).append(arch_name)
+        ARCH_CONFIG_REGISTRY[arch_name] = fn
+        return fn
+
+    return register_model_arch_fn
+
+
+def import_models(models_dir, namespace):
+    """Import each model module/package in *models_dir* as ``namespace.<name>``."""
+    for file in os.listdir(models_dir):
+        path = os.path.join(models_dir, file)
+        if file.startswith(("_", ".")):
+            continue
+        if not (file.endswith(".py") or os.path.isdir(path)):
+            continue
+        # drop only the trailing ".py"; str.find would cut at the first ".py"
+        # anywhere in the file name
+        model_name = file[: -len(".py")] if file.endswith(".py") else file
+        importlib.import_module(namespace + "." + model_name)
+
+        # extra `model_parser` for sphinx
+        if model_name in MODEL_REGISTRY:
+            parser = argparse.ArgumentParser(add_help=False)
+            group_archs = parser.add_argument_group("Named architectures")
+            group_archs.add_argument(
+                "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name]
+            )
+            group_args = parser.add_argument_group("Additional command-line arguments")
+            MODEL_REGISTRY[model_name].add_args(group_args)
+            globals()[model_name + "_parser"] = parser
+
+
+# automatically import any Python files in the models/ directory
+models_dir = os.path.dirname(__file__)
+import_models(models_dir, "fairseq.models")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
new file mode 100644
index 0000000000..a701923f7e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hub_interface import *  # noqa
+from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
new file mode 100644
index 0000000000..6b647c9642
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
@@ -0,0 +1,211 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+from typing import Dict, List
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.data import encoders
+from fairseq.hub_utils import GeneratorHubInterface
+from omegaconf import open_dict
+
+
+logger = logging.getLogger(__name__)
+
+
+class BARTHubInterface(GeneratorHubInterface):
+    """A simple PyTorch Hub interface to BART.
+
+    Usage: https://github.com/pytorch/fairseq/tree/main/examples/bart
+    """
+
+    def __init__(self, cfg, task, model):
+        super().__init__(cfg, task, [model])  # the base class is given a one-model list
+        self.model = self.models[0]  # shortcut to the first entry of self.models
+
+    def encode(
+        self, sentence: str, *addl_sentences, no_separator=True
+    ) -> torch.LongTensor:
+        """
+        BPE-encode one sentence (plus any additional sentences) into token ids.
+
+        The result starts with a beginning-of-sentence (`<s>`) symbol and each
+        sentence is closed by an end-of-sentence (`</s>`) symbol.
+
+        Example (single sentence): `<s> a b c </s>`
+        Example (sentence pair): `<s> d e f </s> 1 2 3 </s>`
+
+        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
+        requires leading spaces. For example::
+
+            >>> bart.encode('Hello world').tolist()
+            [0, 31414, 232, 2]
+            >>> bart.encode(' world').tolist()
+            [0, 232, 2]
+            >>> bart.encode('world').tolist()
+            [0, 8331, 2]
+        """
+        pieces = self.bpe.encode(sentence).split(" ")
+        # only the first sentence is clipped; 2 positions are kept for <s> and </s>
+        budget = min(self.max_positions) - 2
+        bpe_sentence = "<s> " + " ".join(pieces[:budget]) + " </s>"
+        separator = "" if no_separator else " </s>"
+        for extra in addl_sentences:
+            bpe_sentence += separator + " " + self.bpe.encode(extra) + " </s>"
+        ids = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False)
+        return ids.long()
+
+    def decode(self, tokens: torch.LongTensor):
+        """Turn a 1-D tensor of token ids back into text (a list if it splits)."""
+        assert tokens.dim() == 1
+        vocab = self.task.source_dictionary
+        ids = tokens.cpu().numpy()
+        if ids[0] == vocab.bos():
+            ids = ids[1:]  # drop the leading <s>
+        is_eos = ids == vocab.eos()
+        # two consecutive </s> mark a boundary; cut between them
+        cut_points = (is_eos[1:] & is_eos[:-1]).nonzero()[0] + 1
+        texts = [
+            self.bpe.decode(vocab.string(seg)) for seg in np.split(ids, cut_points)
+        ]
+        return texts[0] if len(texts) == 1 else texts
+
+    def _build_sample(self, src_tokens: List[torch.LongTensor]):
+        """Collate *src_tokens* into a single sample and send it to self.device."""
+        # the task wants the element count of every input next to the inputs
+        lengths = [t.numel() for t in src_tokens]
+        dataset = self.task.build_dataset_for_inference(src_tokens, lengths)
+        collated = dataset.collater(dataset)
+        return utils.apply_to_sample(
+            lambda tensor: tensor.to(self.device), collated
+        )
+
+    def generate(
+        self,
+        tokenized_sentences: List[torch.LongTensor],
+        *args,
+        inference_step_args=None,
+        skip_invalid_size_inputs=False,
+        **kwargs
+    ) -> List[List[Dict[str, torch.Tensor]]]:
+        """Generate hypotheses for each input, ordered by the batch sample ids."""
+        if inference_step_args and "prefix_tokens" in inference_step_args:
+            raise NotImplementedError("prefix generation not implemented for BART")
+        # copy: mutating the caller's dict would make its reuse fail the check above
+        inference_step_args = dict(inference_step_args or {})
+        res = []
+        for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
+            src_tokens = batch["net_input"]["src_tokens"]
+            inference_step_args["prefix_tokens"] = src_tokens.new_full(
+                (src_tokens.size(0), 1), fill_value=self.task.source_dictionary.bos()
+            ).to(device=self.device)
+            results = super().generate(
+                src_tokens,
+                *args,
+                inference_step_args=inference_step_args,
+                skip_invalid_size_inputs=skip_invalid_size_inputs,
+                **kwargs
+            )
+            res.extend(zip(batch["id"].tolist(), results))
+        return [hypos for _, hypos in sorted(res, key=lambda x: x[0])]
+
+    def extract_features(
+        self, tokens: torch.LongTensor, return_all_hiddens: bool = False
+    ) -> torch.Tensor:
+        """Run the model in features-only mode on *tokens* (1-D input is batched)."""
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        if tokens.size(-1) > min(self.model.max_positions()):
+            raise ValueError(
+                "tokens exceeds maximum length: {} > {}".format(
+                    tokens.size(-1), self.model.max_positions()
+                )
+            )
+        # Tensor.to() is not in-place: keep the result, or the move is silently lost
+        tokens = tokens.to(device=self.device)
+        prev_output_tokens = tokens.clone()
+        # decoder input = source shifted right, led by the token at index
+        # (#non-pad - 1) of each row
+        prev_output_tokens[:, 0] = tokens.gather(
+            1,
+            (tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1).unsqueeze(-1),
+        ).squeeze()
+        prev_output_tokens[:, 1:] = tokens[:, :-1]
+        features, extra = self.model(
+            src_tokens=tokens,
+            src_lengths=None,
+            prev_output_tokens=prev_output_tokens,
+            features_only=True,
+            return_all_hiddens=return_all_hiddens,
+        )
+        if return_all_hiddens:
+            # convert from T x B x C -> B x T x C
+            inner_states = extra["inner_states"]
+            return [inner_state.transpose(0, 1) for inner_state in inner_states]
+        return features  # just the last layer's features
+
+    def register_classification_head(
+        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
+    ):
+        self.model.register_classification_head(  # thin pass-through to the model
+            name, num_classes=num_classes, embedding_size=embedding_size, **kwargs
+        )  # NOTE(review): BARTModel names this size `inner_dim`; confirm embedding_size is used
+
+    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
+        """Score *tokens* with classification head *head* (log-probs by default)."""
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        features = self.extract_features(tokens.to(device=self.device))
+        # gather the features at every </s>, then keep the last one of each row
+        eos_mask = tokens.eq(self.task.source_dictionary.eos())
+        per_row = features[eos_mask, :].view(features.size(0), -1, features.size(-1))
+        sentence_representation = per_row[:, -1, :]
+
+        logits = self.model.classification_heads[head](sentence_representation)
+        return logits if return_logits else F.log_softmax(logits, dim=-1)
+
+    def fill_mask(
+        self,
+        masked_inputs: List[str],
+        topk: int = 5,
+        match_source_len: bool = True,
+        **generate_kwargs
+    ):
+        """Return, per input, up to *topk* (decoded text, score) pairs.
+
+        Every input must contain the literal `<mask>` token.
+        """
+        masked_token = "<mask>"
+        glue = " {0} ".format(masked_token)
+        batch_tokens = []
+        for masked_input in masked_inputs:
+            assert (
+                masked_token in masked_input
+            ), "please add one {} token for the input".format(masked_token)
+            # BPE-encode the text around each mask, then re-insert the mask token
+            spans = masked_input.split(masked_token)
+            encoded_spans = [self.bpe.encode(span.rstrip()) for span in spans]
+            bpe_line = "<s> " + glue.join(encoded_spans).strip() + " </s>"
+            ids = self.task.source_dictionary.encode_line(
+                bpe_line, append_eos=False, add_if_not_exist=False
+            )
+            batch_tokens.append(ids.long())
+
+        # ensure beam size is at least as big as topk
+        generate_kwargs["beam"] = max(
+            topk,
+            generate_kwargs.get("beam", -1),
+        )
+        generate_kwargs["match_source_len"] = match_source_len
+
+        results = []
+        for hypos in self.generate(batch_tokens, **generate_kwargs):
+            top = hypos[:topk]
+            results.append([(self.decode(h["tokens"]), h["score"]) for h in top])
+        return results
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
new file mode 100644
index 0000000000..e3670c0a2c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
@@ -0,0 +1,394 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+BART: Denoising Sequence-to-Sequence Pre-training for
+Natural Language Generation, Translation, and Comprehension
+"""
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from fairseq import utils
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import TransformerModel
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+from .hub_interface import BARTHubInterface
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("bart")
+class BARTModel(TransformerModel):
+    __jit_unused_properties__ = ["supported_targets"]
+
+    @classmethod
+    def hub_models(cls):
+        return {  # model name -> archive URL; NOTE(review): plain http, consider https
+            "bart.base": "http://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz",
+            "bart.large": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz",
+            "bart.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz",
+            "bart.large.cnn": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz",
+            "bart.large.xsum": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz",
+        }
+
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+
+        # We follow BERT's random weight initialization
+        self.apply(init_bert_params)
+
+        self.classification_heads = nn.ModuleDict()  # filled by register_classification_head
+        if hasattr(self.encoder, "dictionary"):
+            self.eos: int = self.encoder.dictionary.eos()  # NOTE(review): forward() reads self.eos unconditionally
+
+    @staticmethod
+    def add_args(parser):
+        super(BARTModel, BARTModel).add_args(parser)  # static method: super() needs explicit arguments
+        parser.add_argument(
+            "--pooler-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability in the masked_lm pooler layers",
+        )
+        parser.add_argument(
+            "--pooler-activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use for pooler layer",
+        )
+        parser.add_argument(
+            "--spectral-norm-classification-head",
+            action="store_true",
+            help="Apply spectral normalization on the classification head",
+        )
+
+    @property
+    def supported_targets(self):
+        return {"self"}  # this property is named in __jit_unused_properties__ on the class
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        features_only: bool = False,
+        classification_head_name: Optional[str] = None,
+        token_embeddings: Optional[torch.Tensor] = None,
+        return_all_hiddens: bool = True,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        if classification_head_name is not None:
+            features_only = True  # the head is applied to decoder features below
+
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            token_embeddings=token_embeddings,
+            return_all_hiddens=return_all_hiddens,
+        )
+        x, extra = self.decoder(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            features_only=features_only,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+            src_lengths=src_lengths,
+            return_all_hiddens=return_all_hiddens,
+        )
+        eos: int = self.eos  # NOTE(review): __init__ only sets eos when the encoder has a dictionary
+        if classification_head_name is not None:
+            sentence_representation = x[src_tokens.eq(eos), :].view(
+                x.size(0), -1, x.size(-1)
+            )[:, -1, :]  # features at the last eos position of each row
+            for k, head in self.classification_heads.items():
+                # for torch script only supports iteration
+                if k == classification_head_name:
+                    x = head(sentence_representation)
+                    break  # if no head matches, x stays the decoder features
+        return x, extra
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="gpt2",
+        sample_break_mode="eos",
+        **kwargs,
+    ):
+        from fairseq import hub_utils  # imported at call time, not at module import
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),  # lets the names from hub_models() be resolved
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            sample_break_mode=sample_break_mode,
+            **kwargs,
+        )
+        return BARTHubInterface(x["args"], x["task"], x["models"][0])  # wraps the first loaded model
+
+    def register_classification_head(
+        self, name, num_classes=None, inner_dim=None, **kwargs
+    ):
+        """Register a classification head."""
+        logger.info("Registering classification head: {0}".format(name))
+        if name in self.classification_heads:
+            prev_num_classes = self.classification_heads[name].out_proj.out_features
+            prev_inner_dim = self.classification_heads[name].dense.out_features
+            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
+                logger.warning(  # warn only; the head is replaced below either way
+                    're-registering head "{}" with num_classes {} (prev: {}) '
+                    "and inner_dim {} (prev: {})".format(
+                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
+                    )
+                )
+        self.classification_heads[name] = BARTClassificationHead(
+            input_dim=self.args.encoder_embed_dim,
+            inner_dim=inner_dim or self.args.encoder_embed_dim,  # falls back to the encoder embed dim
+            num_classes=num_classes,
+            activation_fn=self.args.pooler_activation_fn,
+            pooler_dropout=self.args.pooler_dropout,
+            do_spectral_norm=getattr(
+                self.args, "spectral_norm_classification_head", False
+            ),
+        )
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+
+        prefix = name + "." if name != "" else ""  # key prefix used for the head entries below
+        current_head_names = (
+            []
+            if not hasattr(self, "classification_heads")
+            else self.classification_heads.keys()
+        )
+
+        # Handle new classification heads present in the state dict.
+        keys_to_delete = []
+        for k in state_dict.keys():
+            if not k.startswith(prefix + "classification_heads."):
+                continue
+
+            head_name = k[len(prefix + "classification_heads.") :].split(".")[0]
+            num_classes = state_dict[
+                prefix + "classification_heads." + head_name + ".out_proj.weight"
+            ].size(0)
+            inner_dim = state_dict[
+                prefix + "classification_heads." + head_name + ".dense.weight"
+            ].size(0)
+
+            if getattr(self.args, "load_checkpoint_heads", False):
+                if head_name not in current_head_names:
+                    self.register_classification_head(head_name, num_classes, inner_dim)
+            else:
+                if head_name not in current_head_names:
+                    logger.warning(
+                        "deleting classification head ({}) from checkpoint "
+                        "not present in current model: {}".format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+                elif (
+                    num_classes
+                    != self.classification_heads[head_name].out_proj.out_features
+                    or inner_dim
+                    != self.classification_heads[head_name].dense.out_features
+                ):
+                    logger.warning(
+                        "deleting classification head ({}) from checkpoint "
+                        "with different dimensions than current model: {}".format(
+                            head_name, k
+                        )
+                    )
+                    keys_to_delete.append(k)
+        for k in keys_to_delete:  # deleted after the loop so the dict is not mutated while iterated
+            del state_dict[k]
+
+        def truncate_emb(key):
+            if key in state_dict:
+                state_dict[key] = state_dict[key][:-1, :]  # drop the last row
+        # NOTE(review): the embedding keys below are looked up without `prefix`; confirm
+        # When finetuning on translation task, remove last row of
+        # embedding matrix that corresponds to mask_idx token.
+        loaded_dict_size = state_dict["encoder.embed_tokens.weight"].size(0)
+        if (
+            loaded_dict_size == len(self.encoder.dictionary) + 1
+            and "<mask>" not in self.encoder.dictionary
+        ):
+            truncate_emb("encoder.embed_tokens.weight")
+            truncate_emb("decoder.embed_tokens.weight")
+            truncate_emb("encoder.output_projection.weight")
+            truncate_emb("decoder.output_projection.weight")
+
+        # When continued pretraining on new set of languages for mbart,
+        # add extra lang embeddings at the end of embed_tokens.
+        # Note: newly added languages are assumed to have been added at the end.
+        if self.args.task == "multilingual_denoising" and loaded_dict_size < len(
+            self.encoder.dictionary
+        ):
+            logger.info(
+                "Adding extra language embeddings not found in pretrained model for "
+                "continued pretraining of MBART on new set of languages."
+            )
+            loaded_mask_token_embedding = state_dict["encoder.embed_tokens.weight"][
+                -1, :
+            ]
+
+            num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size
+            embed_dim = state_dict["encoder.embed_tokens.weight"].size(1)
+
+            new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim)
+            nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim**-0.5)
+            new_lang_embed_to_add = new_lang_embed_to_add.to(
+                dtype=state_dict["encoder.embed_tokens.weight"].dtype,
+            )
+            # new layout: loaded rows without the last one, new rows, then the old last row
+            state_dict["encoder.embed_tokens.weight"] = torch.cat(
+                [
+                    state_dict["encoder.embed_tokens.weight"][
+                        : loaded_dict_size - 1, :
+                    ],
+                    new_lang_embed_to_add,
+                    loaded_mask_token_embedding.unsqueeze(0),
+                ]
+            )
+            state_dict["decoder.embed_tokens.weight"] = torch.cat(
+                [
+                    state_dict["decoder.embed_tokens.weight"][
+                        : loaded_dict_size - 1, :
+                    ],
+                    new_lang_embed_to_add,
+                    loaded_mask_token_embedding.unsqueeze(0),
+                ]
+            )
+
+        # Copy any newly-added classification heads into the state dict
+        # with their current weights.
+        if hasattr(self, "classification_heads"):
+            cur_state = self.classification_heads.state_dict()
+            for k, v in cur_state.items():
+                if prefix + "classification_heads." + k not in state_dict:
+                    logger.info("Overwriting " + prefix + "classification_heads." + k)
+                    state_dict[prefix + "classification_heads." + k] = v
+
+    def set_beam_size(self, beam):
+        """Forward *beam* to every decoder cross-attention that accepts it."""
+        found_beamable = False
+        for layer in self.decoder.layers:
+            cross_attn = layer.encoder_attn
+            if cross_attn is not None and hasattr(cross_attn, "set_beam_size"):
+                cross_attn.set_beam_size(beam)
+                found_beamable = True
+        if found_beamable:
+            self.encoder.reorder_encoder_out = self.encoder._reorder_encoder_out
+
+
+class BARTClassificationHead(nn.Module):
+    """Sentence-level head: dropout, dense, activation, dropout, output projection."""
+
+    def __init__(
+        self,
+        input_dim,
+        inner_dim,
+        num_classes,
+        activation_fn,
+        pooler_dropout,
+        do_spectral_norm=False,
+    ):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        # optionally wrap the output projection in spectral normalization
+        projection = nn.Linear(inner_dim, num_classes)
+        if do_spectral_norm:
+            projection = torch.nn.utils.spectral_norm(projection)
+        self.out_proj = projection
+
+    def forward(self, features, **kwargs):
+        """Map *features* to class scores; extra keyword arguments are ignored."""
+        # the same dropout module is applied before the dense layer and again
+        # before the output projection
+        hidden = self.dense(self.dropout(features))
+        hidden = self.activation_fn(hidden)
+        return self.out_proj(self.dropout(hidden))
+
+
+@register_model_architecture("bart", "bart_large")
+def bart_large_architecture(args):  # fills in a default for every option not already set on args
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 1024)
+    args.encoder_layers = getattr(args, "encoder_layers", 12)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 12)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.relu_dropout = getattr(args, "relu_dropout", 0.0)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.max_target_positions = getattr(args, "max_target_positions", 1024)
+    args.max_source_positions = getattr(args, "max_source_positions", 1024)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", True
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", True)
+    # decoder input/output dims default to the decoder embedding dim resolved above
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", True)
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", True)
+
+    args.activation_fn = getattr(args, "activation_fn", "gelu")
+    args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
+    args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
+
+
+@register_model_architecture("bart", "bart_base")
+def bart_base_architecture(args):
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 768)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12)
+    bart_large_architecture(args)  # options not set above take the bart_large defaults
+
+
+@register_model_architecture("bart", "mbart_large")
+def mbart_large_architecture(args):
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)  # bart_large defaults this to True
+    bart_large_architecture(args)
+
+
+@register_model_architecture("bart", "mbart_base")
+def mbart_base_architecture(args):
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)  # bart_large defaults this to True
+    bart_base_architecture(args)
+
+
+@register_model_architecture("bart", "mbart_base_wmt20")
+def mbart_base_wmt20_architecture(args):
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)  # bart_large defaults this to True
+    mbart_base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
new file mode 100644
index 0000000000..4e20fe3a83
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
@@ -0,0 +1,57 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .fairseq_encoder import FairseqEncoder
+
+
+class CompositeEncoder(FairseqEncoder):
+    """
+    Bundle several :class:`FairseqEncoder` objects behind a single encoder.
+
+    Forward is run on every wrapped encoder and the outputs are returned as a
+    dictionary. The first encoder's dictionary is passed to the base class.
+
+    Args:
+        encoders (dict): a dictionary of :class:`FairseqEncoder` objects.
+    """
+
+    def __init__(self, encoders):
+        first_encoder = next(iter(encoders.values()))
+        super().__init__(first_encoder.dictionary)
+        self.encoders = encoders
+        for name, encoder in self.encoders.items():
+            self.add_module(name, encoder)
+
+    def forward(self, src_tokens, src_lengths):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`
+
+        Returns:
+            dict:
+                the outputs from each Encoder
+        """
+        return {k: enc(src_tokens, src_lengths) for k, enc in self.encoders.items()}
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder each encoder's entry of *encoder_out* (in place) by new_order."""
+        for name, encoder in self.encoders.items():
+            encoder_out[name] = encoder.reorder_encoder_out(
+                encoder_out[name], new_order
+            )
+        return encoder_out
+
+    def max_positions(self):
+        """Return the smallest max_positions() among the wrapped encoders."""
+        return min(encoder.max_positions() for encoder in self.encoders.values())
+
+    def upgrade_state_dict(self, state_dict):
+        """Let every wrapped encoder upgrade *state_dict*, then return it."""
+        for encoder in self.encoders.values():
+            encoder.upgrade_state_dict(state_dict)
+        return state_dict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
new file mode 100644
index 0000000000..fd76bcd4bf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
@@ -0,0 +1,147 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import signal
+import threading
+
+import torch
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel
+
+from fairseq.distributed import (
+    DistributedTimeoutWrapper,
+    LegacyDistributedDataParallel,
+    ModuleProxyWrapper,
+    TPUDistributedDataParallel,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_SLOWMO_DDP_DISABLED = False  # set to True below when the fairscale import fails
+try:
+    from fairscale.experimental.nn.data_parallel import (
+        SlowMoBaseAlgorithm,
+        SlowMoDistributedDataParallel,
+    )
+except ImportError:
+    _SLOWMO_DDP_DISABLED = True  # checked by the "slowmo" backend branch
+
+
+def DistributedFairseqModel(args, model, process_group, device):
+    """
+    Wrap a *model* to support distributed data parallel training.
+
+    This is similar to the built-in DistributedDataParallel, but allows
+    additional configuration of the DistributedDataParallel class to
+    use, and also provides easier access to the wrapped model by
+    forwarding requests for missing attributes to the wrapped model.
+
+    Args:
+        args (argparse.Namespace): fairseq args
+        model (BaseFairseqModel): model to wrap
+        process_group: the c10d process group to be used for distributed data
+            parallel all-reduction.
+        device: device to move model to
+    """
+    assert isinstance(model, nn.Module)
+    if args.tpu:
+        wrapped_model = TPUDistributedDataParallel(
+            module=model.to(device),
+            process_group=process_group,
+        )
+        # forward missing getattr and state_dict/load_state_dict to orig model
+        wrapped_model = ModuleProxyWrapper(wrapped_model)
+    elif args.ddp_backend in {"c10d", "pytorch_ddp"}:
+        wrapped_model = DistributedDataParallel(
+            module=model.to(device),
+            device_ids=[args.device_id],
+            output_device=args.device_id,
+            broadcast_buffers=args.broadcast_buffers,
+            bucket_cap_mb=args.bucket_cap_mb,
+            process_group=process_group,
+            find_unused_parameters=args.find_unused_parameters,
+            gradient_as_bucket_view=args.gradient_as_bucket_view,
+        )
+        if args.ddp_comm_hook == "fp16":
+            logger.info("enable fp16 communication hook in DDP")
+            try:
+                from torch.distributed.algorithms.ddp_comm_hooks import (
+                    DDPCommHookType,
+                    register_ddp_comm_hook,
+                )
+            except ImportError:  # only an import failure warrants the hint logged below
+                logger.error(
+                    "Could not import from torch.distributed.algorithms.ddp_comm_hooks; you may need to update your pytorch version"
+                )
+                raise
+
+            register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, wrapped_model)
+        # forward missing getattr and state_dict/load_state_dict to orig model
+        wrapped_model = ModuleProxyWrapper(wrapped_model)
+    elif args.ddp_backend in {"no_c10d", "legacy_ddp"}:
+        wrapped_model = LegacyDistributedDataParallel(
+            module=model.to(device),
+            buffer_size=2**28,
+            process_group=process_group,
+        )
+        # forward missing getattr and state_dict/load_state_dict to orig model
+        wrapped_model = ModuleProxyWrapper(wrapped_model)
+    elif args.ddp_backend == "slowmo":
+        if _SLOWMO_DDP_DISABLED:
+            raise ImportError(
+                "Cannot find SlowMoDistributedDataParallel. "
+                "Please install fairscale with: pip install fairscale"
+            )
+
+        # The values of slowmo_momentum below were obtained by tuning on the
+        # En-De 16 dataset by training the transformer_wmt_en_de_large model
+        if args.slowmo_momentum is None:
+            if args.distributed_world_size <= 16:
+                args.slowmo_momentum = 0.0
+            elif args.distributed_world_size <= 32:
+                args.slowmo_momentum = 0.2
+            elif args.distributed_world_size <= 64:
+                args.slowmo_momentum = 0.5
+            else:
+                args.slowmo_momentum = 0.6
+        slowmo_base_algorithm = SlowMoBaseAlgorithm[args.slowmo_base_algorithm.upper()]
+
+        wrapped_model = SlowMoDistributedDataParallel(
+            module=model.to(device),
+            broadcast_buffers=args.broadcast_buffers,
+            nprocs_per_node=args.nprocs_per_node,
+            slowmo_momentum=args.slowmo_momentum,
+            slowmo_base_algorithm=slowmo_base_algorithm,
+            localsgd_frequency=args.localsgd_frequency,
+        )
+        # forward missing getattr and state_dict/load_state_dict to orig model
+        wrapped_model = ModuleProxyWrapper(wrapped_model)
+    elif args.ddp_backend == "fully_sharded":
+        try:
+            from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
+        except ImportError:
+            raise ImportError(
+                "Cannot find FullyShardedDataParallel. "
+                "Please install fairscale with: pip install fairscale"
+            )
+        assert isinstance(model, FSDP), "expected model to already be wrapped in FSDP"
+        wrapped_model = model
+        if args.memory_efficient_fp16:
+            wrapped_model = wrapped_model.half()
+        if not args.cpu_offload:
+            wrapped_model = wrapped_model.to(device=device)
+    else:
+        raise ValueError("Unknown --ddp-backend: " + args.ddp_backend)
+
+    # kill hung distributed jobs after a timeout
+    if getattr(args, "heartbeat_timeout", -1) > 0:
+        wrapped_model = DistributedTimeoutWrapper(
+            wrapped_model, timeout=getattr(args, "heartbeat_timeout", -1)
+        )
+
+    return wrapped_model
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
new file mode 100644
index 0000000000..503ceaa609
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+from .ema import EMA
+
+
+def build_ema(model, cfg, device):  # factory for the trainer-level EMA tracker
+    return EMA(model=model, config=cfg, device=device)
+
+
+# import every public (non-underscore) .py module of models/ema/ at package load
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if not file.startswith("_") and file.endswith(".py"):
+        file_name = file[: file.find(".py")]
+        importlib.import_module(f"fairseq.models.ema.{file_name}")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
new file mode 100644
index 0000000000..472d5d5f17
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+
+"""
+This module has the EMA class used to store a copy of the exponentially decayed
+model params.
+
+Typical usage of EMA class involves initializing an object using an existing
+model (random or from a seed model) and setting the config like ema_decay,
+ema_start_update which determine how the EMA model is updated. After every
+update of the model i.e. at the end of the train_step, the EMA should be updated
+by passing the new model to the EMA.step function. The EMA model state dict
+can be stored in the extra state under the key of "ema" and dumped
+into a checkpoint and loaded. The EMA object can be passed to tasks
+by setting task.uses_ema property.
+EMA is a smoothed/ensemble model which might have better performance
+when used for inference or further fine-tuning. EMA class has a
+reverse function to load the EMA params into a model and use it
+like a regular model.
+
+This implementation is used for trainer-level ema tracking. For EMA tracking
+inside the model, please use fairseq/modules/ema_module.py instead.
+"""
+
+import copy
+import logging
+
+import torch
+
+from fairseq import checkpoint_utils
+
+
+class EMA(object):
+    """Exponential Moving Average of Fairseq Models
+    EMA keeps a copy of the exponentially decayed model params.
+    The set of params should include both gradient-descent and
+    non-gradient descent params, such as batch mean/var and buffers.
+    This is a modified implementation of
+    the open source code in https://github.com/zhawe01/fairseq-gec.git,
+    and internal source code in
+    fbcode/mobile-vision/projects/classification_pytorch/lib/utils/model_ema.py.
+
+    Similar to TF EMA.
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage.
+    EMA provides an averaged and smoothed set of model weights, and has been shown to
+    improve vision models. EMA class does all necessary functions to update, reload,
+    or init EMA methods.
+
+    EMA object is initialized from an arbitrary model. By default, it is stored in
+    the same device (unless device specified at initialization) and with the
+    same precision as the model (unless ema_fp32 is True). ema_fp32 is recommended.
+    This stores the EMA parameters in fp32 only for the EMA update step, and
+    is used at the default precision otherwise.
+    EMA is usually enabled using EMAConfig with store_ema=True. Some important
+    parameters to configure EMA are
+    1) ema_decay - The decay of EMA
+    2) ema_update_freq - EMA is updated every this many model updates.
+    3) ema_start_update - Start EMA update after this many model updates [default 0]
+
+    Key methods:
+    1) step - One update of EMA using new model
+    2) restore - Update EMA from a state dict
+    3) reverse - Load EMA into a model
+    4) get_decay, _set_decay - Used to get or set the decay.  Note _set_decay is
+    called from step.
+    5) build_fp32_params - Used to initialize or update the fp32 copy of EMA params.
+    Note this is enabled only when ema_fp32=True
+    """
+
+    def __init__(self, model, config, device=None, skip_keys=None):
+        """
+        @param model model to initialize the EMA with
+        @param config EMAConfig object (ema_decay, ema_update_freq, ema_fp32, ...)
+        @param device If provided, copy EMA to this device (e.g. gpu).
+        Otherwise EMA is in the same device as the model.
+        @param skip_keys state-dict keys that are copied from the model, never decayed
+        """
+
+        self.decay = config.ema_decay
+        self.model = copy.deepcopy(model)
+        self.model.requires_grad_(False)  # the EMA copy takes no gradients
+        self.config = config
+        self.skip_keys = skip_keys or set()
+        self.fp32_params = {}
+
+        if self.config.ema_seed_model is not None:
+            state = checkpoint_utils.load_ema_from_checkpoint(
+                self.config.ema_seed_model
+            )
+            self.model.load_state_dict(state["model"], strict=True)
+
+        if device is not None:
+            logging.getLogger(__name__).info("Copying EMA model to device %s", device)
+            self.model = self.model.to(device=device)
+
+        if self.config.ema_fp32:
+            self.build_fp32_params()
+
+        self.update_freq_counter = 0
+
+    def get_model(self):
+        return self.model
+
+    def build_fp32_params(self, state_dict=None):
+        """
+        Store a copy of the EMA params in fp32.
+        If state dict is passed, the EMA params are copied from
+        the provided state dict. Otherwise, they are copied from the
+        current EMA model parameters.
+        """
+        if not self.config.ema_fp32:
+            raise RuntimeError(
+                "build_fp32_params should not be called if ema_fp32=False. "
+                "Use ema_fp32=True if this is really intended."
+            )
+
+        if state_dict is None:
+            state_dict = self.model.state_dict()
+
+        def _to_float(t):
+            return t.float() if torch.is_floating_point(t) else t
+
+        for param_key in state_dict:
+            if param_key in self.fp32_params:
+                self.fp32_params[param_key].copy_(state_dict[param_key])
+            else:
+                self.fp32_params[param_key] = _to_float(state_dict[param_key])
+
+    def restore(self, state_dict, build_fp32_params=False):
+        """Load a state dict into the EMA model (non-strict)."""
+        self.model.load_state_dict(state_dict, strict=False)
+        if build_fp32_params:
+            self.build_fp32_params(state_dict)
+
+    def _set_decay(self, decay):
+        self.decay = decay
+
+    def get_decay(self):
+        return self.decay
+
+    def _step_internal(self, new_model, updates=None):
+        """One update of the EMA model based on new model weights"""
+        decay = self.decay
+
+        ema_state_dict = {}
+        ema_params = (
+            self.fp32_params if self.config.ema_fp32 else self.model.state_dict()
+        )
+        for key, param in new_model.state_dict().items():
+            if isinstance(param, dict):
+                continue  # nested dict entries are not tensors; skip them
+            try:
+                ema_param = ema_params[key]
+            except KeyError:
+                ema_param = (
+                    param.float().clone() if param.ndim == 1 else copy.deepcopy(param)
+                )
+
+            if param.shape != ema_param.shape:
+                raise ValueError(
+                    "incompatible tensor shapes between model param and ema param: "
+                    + "{} vs. {}".format(param.shape, ema_param.shape)
+                )
+
+            if "version" in key:
+                # Do not decay a model.version pytorch param
+                continue
+
+            if key in self.skip_keys:
+                ema_param = param.to(dtype=ema_param.dtype).clone()
+            else:
+                ema_param.mul_(decay)
+                ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay)
+            ema_state_dict[key] = ema_param
+        self.restore(ema_state_dict, build_fp32_params=False)
+
+    def step(self, new_model, updates=None):
+        """
+        One update of EMA which is done every self.config.ema_update_freq
+        updates of the model.
+
+        @param updates The current number of model updates done.
+        Decay is set to 0 if model updates < ema_start_update, which means
+        the model will be simply copied over to the EMA.
+        When model updates >= ema_start_update, then EMA is updated with
+        a decay of self.config.ema_decay.
+        """
+        if updates is not None:
+            self._set_decay(
+                0 if updates < self.config.ema_start_update else self.config.ema_decay
+            )
+        if self.config.ema_update_freq > 1:
+            self.update_freq_counter += 1
+            if self.update_freq_counter >= self.config.ema_update_freq:
+                self._step_internal(new_model, updates)
+                self.update_freq_counter = 0
+        else:
+            self._step_internal(new_model, updates)
+
+    def reverse(self, model):
+        """
+        Load the model parameters from EMA model.
+        Useful for inference or fine-tuning from the EMA model.
+        """
+        d = self.model.state_dict()
+        if "_ema" in d:
+            del d["_ema"]
+
+        model.load_state_dict(d, strict=False)
+        return model
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
new file mode 100644
index 0000000000..13b73d639e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+import torch.nn as nn
+from fairseq import utils
+from torch import Tensor
+
+
+class FairseqDecoder(nn.Module):
+    """Base class for decoders."""
+
+    def __init__(self, dictionary):
+        super().__init__()
+        self.dictionary = dictionary
+        self.onnx_trace = False  # flipped by prepare_for_onnx_export_()
+        self.adaptive_softmax = None  # optional; used for normalization when set
+
+    def forward(self, prev_output_tokens, encoder_out=None, **kwargs):
+        """
+        Args:
+            prev_output_tokens (LongTensor): shifted output tokens of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder, used for
+                encoder-side attention
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs):
+        """
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def output_layer(self, features, **kwargs):
+        """
+        Project features to the default output size, e.g., vocabulary size.
+
+        Args:
+            features (Tensor): features returned by *extract_features*.
+        """
+        raise NotImplementedError
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+
+    # TorchScript doesn't support super(), so a scriptable subclass cannot call
+    # the base-class implementation of an overridden method.
+    # Workaround: keep the logic in a helper with a different name and have the
+    # scriptable subclass call that helper directly.
+    def get_normalized_probs_scriptable(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+
+        if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
+            if sample is not None:
+                assert "target" in sample
+                target = sample["target"]
+            else:
+                target = None
+            out = self.adaptive_softmax.get_log_prob(net_output[0], target=target)
+            return out.exp_() if not log_probs else out  # exp_ modifies out in place
+
+        logits = net_output[0]
+        if log_probs:
+            return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
+        else:
+            return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
+
+    def max_positions(self):
+        """Maximum input length supported by the decoder."""
+        return 1e6  # an arbitrary large number (note: a float, not an int)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade old state dicts to work with newer code."""
+        return state_dict  # base class: returned unchanged
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # passed as onnx_trace= to utils.softmax/log_softmax
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
new file mode 100644
index 0000000000..08cbde15a4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, NamedTuple, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+EncoderOut = NamedTuple(
+    "EncoderOut",  # shapes: B=batch, T=src_len; C presumably the feature dim - confirm
+    [
+        ("encoder_out", Tensor),  # T x B x C
+        ("encoder_padding_mask", Optional[Tensor]),  # B x T
+        ("encoder_embedding", Optional[Tensor]),  # B x T x C
+        ("encoder_states", Optional[List[Tensor]]),  # List[T x B x C]
+        ("src_tokens", Optional[Tensor]),  # B x T
+        ("src_lengths", Optional[Tensor]),  # B x 1
+    ],
+)
+
+
+class FairseqEncoder(nn.Module):
+    """Base class for encoders."""
+
+    def __init__(self, dictionary):
+        super().__init__()
+        self.dictionary = dictionary
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`
+        """
+        raise NotImplementedError
+
+    def forward_torchscript(self, net_input: Dict[str, Tensor]):
+        """A TorchScript-compatible version of forward.
+
+        Encoders which use additional arguments may want to override
+        this method for TorchScript compatibility.
+        """
+        if torch.jit.is_scripting():
+            return self.forward(
+                src_tokens=net_input["src_tokens"],
+                src_lengths=net_input["src_lengths"],
+            )
+        else:
+            return self.forward_non_torchscript(net_input)
+
+    @torch.jit.unused
+    def forward_non_torchscript(self, net_input: Dict[str, Tensor]):
+        encoder_input = {
+            k: v for k, v in net_input.items() if k != "prev_output_tokens"
+        }
+        return self.forward(**encoder_input)  # everything but prev_output_tokens
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """
+        Reorder encoder output according to `new_order`.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            `encoder_out` rearranged according to `new_order`
+        """
+        raise NotImplementedError
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return 1e6  # an arbitrary large number (note: a float, not an int)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade old state dicts to work with newer code."""
+        return state_dict  # base class: returned unchanged
+
+    def set_num_updates(self, num_updates):
+        """Pass the trainer's update count to submodules with set_num_updates."""
+
+        def _apply(m):
+            if hasattr(m, "set_num_updates") and m != self:  # avoid self-recursion
+                m.set_num_updates(num_updates)
+
+        self.apply(_apply)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
new file mode 100644
index 0000000000..cc72a0f8f3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict, Optional
+
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.models import FairseqDecoder
+from torch import Tensor
+
+
+logger = logging.getLogger(__name__)
+
+
+@with_incremental_state
+class FairseqIncrementalDecoder(FairseqDecoder):
+    """Base class for incremental decoders.
+
+    Incremental decoding is a special mode at inference time where the Model
+    only receives a single timestep of input corresponding to the previous
+    output token (for teacher forcing) and must produce the next output
+    *incrementally*. Thus the model must cache any long-term state that is
+    needed about the sequence, e.g., hidden states, convolutional states, etc.
+
+    Compared to the standard :class:`FairseqDecoder` interface, the incremental
+    decoder interface allows :func:`forward` functions to take an extra keyword
+    argument (*incremental_state*) that can be used to cache state across
+    time-steps.
+
+    The :class:`FairseqIncrementalDecoder` interface also defines the
+    :func:`reorder_incremental_state` method, which is used during beam search
+    to select and reorder the incremental state based on the selection of beams.
+
+    To learn more about how incremental decoding works, refer to `this blog
+    <http://www.telesens.co/2019/04/21/understanding-incremental-decoding-in-fairseq/>`_.
+    """
+
+    def __init__(self, dictionary):
+        super().__init__(dictionary)
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): shifted output tokens of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict, optional): dictionary used for storing
+                state during :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs
+    ):
+        """
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        raise NotImplementedError
+
+    def reorder_incremental_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Reorder incremental state.
+
+        This will be called when the order of the input has changed from the
+        previous time step. A typical use case is beam search, where the input
+        order changes between time steps based on the selection of beams.
+        """
+        pass  # no-op by default
+
+    def reorder_incremental_state_scripting(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Main entry point for reordering the incremental state.
+
+        Due to limitations in TorchScript, we call this function in
+        :class:`fairseq.sequence_generator.SequenceGenerator` instead of
+        calling :func:`reorder_incremental_state` directly.
+        """
+        for module in self.modules():
+            if hasattr(module, "reorder_incremental_state"):
+                result = module.reorder_incremental_state(incremental_state, new_order)
+                if result is not None:  # a module may hand back a replacement dict
+                    incremental_state = result
+
+    def set_beam_size(self, beam_size):
+        """Sets the beam size in the decoder and all children."""
+        if getattr(self, "_beam_size", -1) != beam_size:  # skip if unchanged
+            seen = set()
+
+            def apply_set_beam_size(module):
+                if (
+                    module != self
+                    and hasattr(module, "set_beam_size")
+                    and module not in seen
+                ):
+                    seen.add(module)
+                    module.set_beam_size(beam_size)
+
+            self.apply(apply_set_beam_size)
+            self._beam_size = beam_size
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
new file mode 100644
index 0000000000..65ead9dcf2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
@@ -0,0 +1,579 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Base classes for various fairseq models.
+"""
+
+import logging
+from argparse import Namespace
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.data import Dictionary
+from fairseq.dataclass.utils import (
+    convert_namespace_to_omegaconf,
+    gen_parser_from_dataclass,
+)
+from fairseq.models import FairseqDecoder, FairseqEncoder
+from omegaconf import DictConfig
+from torch import Tensor
+
+
+logger = logging.getLogger(__name__)
+
+
+def check_type(module, expected_type):
+    """Assert that *module* (or the module it wraps) is an ``expected_type``."""
+    # a wrapper exposes the real module as ``unwrapped_module``; check that one
+    target = module
+    if hasattr(module, "unwrapped_module"):
+        target = module.unwrapped_module
+    assert isinstance(target, expected_type), f"{type(target)} != {expected_type}"
+
+
+class BaseFairseqModel(nn.Module):
+    """Base class for fairseq models."""
+
+    def __init__(self):
+        super().__init__()
+        self._is_generation_fast = False
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add model-specific arguments to the parser."""
+        dc = getattr(cls, "__dataclass", None)
+        if dc is not None:
+            # do not set defaults so that settings defaults from various architectures still works
+            gen_parser_from_dataclass(parser, dc(), delete_default=True)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        raise NotImplementedError("Model must implement the build_model method")
+
+    def get_targets(self, sample, net_output):
+        """Get targets from either the sample or the net's output."""
+        return sample["target"]
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+
+    # TorchScript doesn't support super() method so that the scriptable Subclass
+    # can't access the base class model in Torchscript.
+    # Current workaround is to add a helper function with different name and
+    # call the helper function from scriptable Subclass.
+    def get_normalized_probs_scriptable(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel"""
+        if hasattr(self, "decoder"):
+            return self.decoder.get_normalized_probs(net_output, log_probs, sample)
+        elif torch.is_tensor(net_output):
+            # syntactic sugar for simple models which don't have a decoder
+            # (e.g., the classification tutorial)
+            logits = net_output.float()
+            if log_probs:
+                return F.log_softmax(logits, dim=-1)
+            else:
+                return F.softmax(logits, dim=-1)
+        raise NotImplementedError
+
+    def extract_features(self, *args, **kwargs):
+        """Similar to *forward* but only return features."""
+        return self(*args, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return None
+
+    def load_state_dict(
+        self,
+        state_dict,
+        strict=True,
+        model_cfg: Optional[DictConfig] = None,
+        args: Optional[Namespace] = None,
+    ):
+        """Copies parameters and buffers from *state_dict* into this module and
+        its descendants.
+
+        Overrides the method in :class:`nn.Module`. Compared with that method
+        this additionally "upgrades" *state_dicts* from old checkpoints.
+        """
+
+        if model_cfg is None and args is not None:
+            logger.warn(
+                "using 'args' is deprecated, please update your code to use dataclass config"
+            )
+            model_cfg = convert_namespace_to_omegaconf(args).model
+
+        self.upgrade_state_dict(state_dict)
+
+        from fairseq.checkpoint_utils import prune_state_dict
+
+        new_state_dict = prune_state_dict(state_dict, model_cfg)
+        return super().load_state_dict(new_state_dict, strict)
+
+    def upgrade_state_dict(self, state_dict):
+        """Upgrade old state dicts to work with newer code."""
+        self.upgrade_state_dict_named(state_dict, "")
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade old state dicts to work with newer code.
+
+        Args:
+            state_dict (dict): state dictionary to upgrade, in place
+            name (str): the state dict key corresponding to the current module
+        """
+        assert state_dict is not None
+
+        def do_upgrade(m, prefix):
+            if len(prefix) > 0:
+                prefix += "."
+
+            for n, c in m.named_children():
+                name = prefix + n
+                if hasattr(c, "upgrade_state_dict_named"):
+                    c.upgrade_state_dict_named(state_dict, name)
+                elif hasattr(c, "upgrade_state_dict"):
+                    c.upgrade_state_dict(state_dict)
+                do_upgrade(c, name)
+
+        do_upgrade(self, name)
+
+    def set_num_updates(self, num_updates):
+        """State from trainer to pass along to model at every update."""
+        for m in self.modules():
+            if hasattr(m, "set_num_updates") and m != self:
+                m.set_num_updates(num_updates)
+
+    def set_epoch(self, epoch):
+        for m in self.modules():
+            if hasattr(m, "set_epoch") and m != self:
+                m.set_epoch(epoch)
+
+    def prepare_for_inference_(self, cfg: DictConfig):
+        """Prepare model for inference."""
+        kwargs = {}
+        kwargs["beamable_mm_beam_size"] = (
+            None
+            if getattr(cfg.generation, "no_beamable_mm", False)
+            else getattr(cfg.generation, "beam", 5)
+        )
+        kwargs["need_attn"] = getattr(cfg.generation, "print_alignment", False)
+        if getattr(cfg.generation, "retain_dropout", False):
+            kwargs["retain_dropout"] = cfg.generation.retain_dropout
+            kwargs["retain_dropout_modules"] = cfg.generation.retain_dropout_modules
+        self.make_generation_fast_(**kwargs)
+
+    def make_generation_fast_(self, **kwargs):
+        """
+        Legacy entry point to optimize model for faster generation.
+        Prefer prepare_for_inference_.
+        """
+        if self._is_generation_fast:
+            return  # only apply once
+        self._is_generation_fast = True
+
+        # remove weight norm from all modules in the network
+        def apply_remove_weight_norm(module):
+            try:
+                nn.utils.remove_weight_norm(module)
+            except (AttributeError, ValueError):  # this module didn't have weight norm
+                return
+
+        self.apply(apply_remove_weight_norm)
+
+        def apply_make_generation_fast_(module, prefix):
+            if len(prefix) > 0:
+                prefix += "."
+
+            base_func = BaseFairseqModel.make_generation_fast_
+            for n, m in module.named_modules():
+                if (
+                    m != self
+                    and hasattr(m, "make_generation_fast_")
+                    # don't call this implementation again, e.g., if
+                    # children modules also inherit from BaseFairseqModel
+                    and m.make_generation_fast_.__func__ is not base_func
+                ):
+                    name = prefix + n
+                    m.make_generation_fast_(name=name, **kwargs)
+
+        apply_make_generation_fast_(self, "")
+
+        def train(mode=True):
+            if mode:
+                raise RuntimeError("cannot train after make_generation_fast")
+
+        # this model should no longer be used for training
+        self.eval()
+        self.train = train
+
+    def prepare_for_onnx_export_(self, **kwargs):
+        """Make model exportable via ONNX trace."""
+        seen = set()
+
+        def apply_prepare_for_onnx_export_(module):
+            if (
+                module != self
+                and hasattr(module, "prepare_for_onnx_export_")
+                and module not in seen
+            ):
+                seen.add(module)
+                module.prepare_for_onnx_export_(**kwargs)
+
+        self.apply(apply_prepare_for_onnx_export_)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        **kwargs,
+    ):
+        """
+        Load a pre-trained :class:`~fairseq.models.FairseqModel`, downloading
+        and caching the model archive first if needed.
+
+        This base implementation returns a
+        :class:`~fairseq.hub_utils.GeneratorHubInterface` for generating
+        translations or sampling from language models; the underlying models
+        are reachable through its *models* attribute. Subclasses may override
+        this method to return a custom hub interface.
+
+        Args:
+            model_name_or_path (str): name of a pre-trained model to load, or
+                a path/URL to a pre-trained model state dict
+            checkpoint_file (str, optional): colon-separated list of the
+                checkpoint files in the model archive that should be
+                ensembled (default: 'model.pt')
+            data_name_or_path (str, optional): path/URL of the archive that
+                args.data should point to; a leading '.' or './' reuses the
+                model archive path
+        """
+        from fairseq import hub_utils
+
+        loaded = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            **kwargs,
+        )
+        cfg = loaded["args"]
+        logger.info(cfg)
+        return hub_utils.GeneratorHubInterface(cfg, loaded["task"], loaded["models"])
+
+    @classmethod
+    def hub_models(cls):
+        return {}  # no archives registered here; from_pretrained passes this as archive_map
+
+
+class FairseqEncoderDecoderModel(BaseFairseqModel):
+    """Base class for models made of one encoder and one decoder.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, encoder, decoder):
+        super().__init__()
+        self.encoder, self.decoder = encoder, decoder
+
+        # run the type check on both components (see check_type)
+        check_type(encoder, FairseqEncoder)
+        check_type(decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Run the full encoder-decoder forward pass.
+
+        The source batch is encoded first; the decoder then consumes the
+        encoder output together with the previous decoder outputs (teacher
+        forcing) to produce the next outputs::
+
+            encoder_out = self.encoder(src_tokens, src_lengths)
+            return self.decoder(prev_output_tokens, encoder_out)
+
+        Any extra keyword arguments are forwarded to both the encoder and the
+        decoder.
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        enc_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        return self.decoder(prev_output_tokens, encoder_out=enc_out, **kwargs)
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        """Call the decoder directly, passing all arguments through."""
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Like *forward*, but return the decoder's features only.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        # same encoder call as forward; only the decoder entry point differs
+        enc_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        return self.decoder.extract_features(
+            prev_output_tokens, encoder_out=enc_out, **kwargs
+        )
+
+    def output_layer(self, features, **kwargs):
+        """Map features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Return the (encoder, decoder) maximum supported lengths."""
+        return self.encoder.max_positions(), self.decoder.max_positions()
+
+    def max_decoder_positions(self):
+        """Return the maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+
+class FairseqModel(FairseqEncoderDecoderModel):
+    """Deprecated subclass of FairseqEncoderDecoderModel; warns when built."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        message = "FairseqModel is deprecated, please use "
+        message += "FairseqEncoderDecoderModel or BaseFairseqModel instead"
+        utils.deprecation_warning(message, stacklevel=4)
+
+
+class FairseqMultiModel(BaseFairseqModel):
+    """Base class for combining multiple encoder-decoder models."""
+
+    def __init__(self, encoders, decoders):
+        super().__init__()
+        assert encoders.keys() == decoders.keys()  # same key set on both sides
+        self.keys = list(encoders.keys())
+        for key in self.keys:
+            check_type(encoders[key], FairseqEncoder)
+            check_type(decoders[key], FairseqDecoder)
+
+        self.models = nn.ModuleDict(  # one encoder-decoder model per key
+            {
+                key: FairseqEncoderDecoderModel(encoders[key], decoders[key])
+                for key in self.keys
+            }
+        )
+
+    @staticmethod
+    def build_shared_embeddings(
+        dicts: Dict[str, Dictionary],
+        langs: List[str],
+        embed_dim: int,
+        build_embedding: callable,
+        pretrained_embed_path: Optional[str] = None,
+    ):
+        """
+        Build shared embeddings for a set of languages after checking that their
+        dictionaries are all equal; a ValueError is raised if any of them differs.
+
+        Args:
+            dicts: Dict of lang_id to its corresponding Dictionary
+            langs: languages that we want to share embeddings for
+            embed_dim: embedding dimension
+            build_embedding: callable that builds the embedding (result is returned)
+            pretrained_embed_path: Optional path to load pretrained embeddings
+        """
+        shared_dict = dicts[langs[0]]
+        if any(dicts[lang] != shared_dict for lang in langs):
+            raise ValueError(
+                "--share-*-embeddings requires a joined dictionary: "
+                "--share-encoder-embeddings requires a joined source "
+                "dictionary, --share-decoder-embeddings requires a joined "
+                "target dictionary, and --share-all-embeddings requires a "
+                "joint source + target dictionary."
+            )
+        return build_embedding(shared_dict, embed_dim, pretrained_embed_path)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        raise NotImplementedError  # no generic forward pass for multi-models
+
+    def max_positions(self):
+        """Per-key (encoder, decoder) maximum lengths supported by the model."""
+        return {
+            key: (
+                self.models[key].encoder.max_positions(),
+                self.models[key].decoder.max_positions(),
+            )
+            for key in self.keys
+        }
+
+    def max_decoder_positions(self):
+        """Smallest maximum length supported across all decoders."""
+        return min(model.decoder.max_positions() for model in self.models.values())
+
+    @property
+    def encoder(self):
+        return self.models[self.keys[0]].encoder  # encoder of the first key
+
+    @property
+    def decoder(self):
+        return self.models[self.keys[0]].decoder  # decoder of the first key
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        return self.decoder(prev_output_tokens, **kwargs)  # first key's decoder
+
+    def load_state_dict(
+        self,
+        state_dict,
+        strict=True,
+        model_cfg=None,
+        args: Optional[Namespace] = None,
+    ):
+        """Copies parameters and buffers from *state_dict* into this module and
+        its descendants.
+
+        Overrides the method in :class:`nn.Module`: *state_dict* is first
+        "upgraded" (for old checkpoints) and pruned via ``prune_state_dict``.
+        *args* is a deprecated way to supply the config; prefer *model_cfg*.
+        """
+        if model_cfg is None and args is not None:
+            logger.warning(
+                "using 'args' is deprecated, please update your code to use dataclass config"
+            )
+            model_cfg = convert_namespace_to_omegaconf(args).model
+
+        self.upgrade_state_dict(state_dict)
+
+        from fairseq.checkpoint_utils import prune_state_dict
+
+        new_state_dict = prune_state_dict(state_dict, model_cfg)
+        return super().load_state_dict(new_state_dict, strict)
+
+
+class FairseqLanguageModel(BaseFairseqModel):
+    """Base class for models that consist of a decoder only.
+
+    Args:
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, decoder):
+        super().__init__()
+        self.decoder = decoder
+        check_type(decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, **kwargs):
+        """
+        Run the decoder-only forward pass.
+
+        The token batch is passed through the decoder to predict the next
+        tokens; extra keyword arguments are handed to the decoder unchanged.
+
+        Args:
+            src_tokens (LongTensor): tokens on which to condition the decoder,
+                of shape `(batch, tgt_len)`
+
+        Returns:
+            tuple: the decoder's output of shape `(batch, seq_len, vocab)` and
+            a dictionary with any model-specific outputs
+        """
+        decoder_out = self.decoder(src_tokens, **kwargs)
+        return decoder_out
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        """Invoke the decoder directly on *prev_output_tokens*."""
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, **kwargs):
+        """
+        Like *forward*, but return the decoder's features only.
+
+        Returns:
+            tuple: the decoder's features of shape `(batch, seq_len, embed_dim)`
+            and a dictionary with any model-specific outputs
+        """
+        return self.decoder.extract_features(src_tokens, **kwargs)
+
+    def output_layer(self, features, **kwargs):
+        """Map features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Return the maximum length supported by the model."""
+        return self.decoder.max_positions()
+
+    def max_decoder_positions(self):
+        """Return the maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+    @property
+    def supported_targets(self):
+        return {"future"}
+
+
+class FairseqEncoderModel(BaseFairseqModel):
+    """Base class for models that consist of an encoder only.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+    """
+
+    def __init__(self, encoder):
+        super().__init__()
+        self.encoder = encoder
+        check_type(encoder, FairseqEncoder)
+
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        Run the encoder-only forward pass.
+
+        The token batch is passed through the encoder to generate features.
+
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+
+        Returns:
+            the encoder's output, typically of shape `(batch, src_len, features)`
+        """
+        return self.encoder(src_tokens, src_lengths, **kwargs)
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Turn a net's output into normalized probabilities (or log probs).
+
+        Only a tensor-valued ``net_output["encoder_out"]`` is supported.
+        """
+        encoder_out = net_output["encoder_out"]
+        if not torch.is_tensor(encoder_out):
+            raise NotImplementedError
+        normalize = F.log_softmax if log_probs else F.softmax
+        return normalize(encoder_out.float(), dim=-1)
+
+    def max_positions(self):
+        """Return the maximum length supported by the model."""
+        return self.encoder.max_positions()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
new file mode 100644
index 0000000000..c99a215101
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
@@ -0,0 +1,756 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    BeamableMM,
+    FairseqDropout,
+    GradMultiply,
+    LearnedPositionalEmbedding,
+    LinearizedConvolution,
+)
+
+
+@register_model("fconv")
+class FConvModel(FairseqEncoderDecoderModel):
+    """
+    Fully convolutional sequence-to-sequence model: a convolutional encoder
+    paired with a convolutional decoder, following `"Convolutional Sequence to
+    Sequence Learning" (Gehring et al., 2017) <https://arxiv.org/abs/1705.03122>`_.
+
+    Args:
+        encoder (FConvEncoder): the encoder
+        decoder (FConvDecoder): the decoder
+
+    The Convolutional model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.fconv_parser
+        :prog:
+    """
+
+    @classmethod
+    def hub_models(cls):
+        """Return the downloadable pre-trained archives, keyed by model name."""
+        archive_urls = {
+            "conv.wmt14.en-fr": (
+                "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2"
+            ),
+            "conv.wmt14.en-de": (
+                "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2"
+            ),
+            "conv.wmt17.en-de": (
+                "https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2"
+            ),
+        }
+
+        # every archive is paired with the moses tokenizer and subword_nmt BPE
+        return {
+            name: {"path": url, "tokenizer": "moses", "bpe": "subword_nmt"}
+            for name, url in archive_urls.items()
+        }
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+        # the encoder scales its gradients by the number of attention layers
+        attention_layers = [layer for layer in decoder.attention if layer is not None]
+        self.encoder.num_attention_layers = len(attention_layers)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the fconv-specific command-line options on *parser*."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
+                            help='encoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+        parser.add_argument('--share-input-output-embed', action='store_true',
+                            help='share input and output embeddings (requires'
+                                 ' --decoder-out-embed-dim and --decoder-embed-dim'
+                                 ' to be equal)')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Construct an FConvModel from parsed arguments and a task."""
+        # fill in defaults for any arguments that are missing (e.g. new ones)
+        base_architecture(args)
+
+        # optional pre-trained embeddings, loaded only when a path is given
+        encoder_embed_dict = decoder_embed_dict = None
+        if args.encoder_embed_path:
+            encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
+            utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary)
+        if args.decoder_embed_path:
+            decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
+            utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary)
+
+        # NOTE: eval() executes these spec strings as code; pass trusted values only
+        encoder = FConvEncoder(
+            dictionary=task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            embed_dict=encoder_embed_dict,
+            convolutions=eval(args.encoder_layers),
+            dropout=args.dropout,
+            max_positions=args.max_source_positions,
+        )
+        decoder = FConvDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            embed_dict=decoder_embed_dict,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_out_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.max_target_positions,
+            share_embed=args.share_input_output_embed,
+        )
+        return FConvModel(encoder, decoder)
+
+
+class FConvEncoder(FairseqEncoder):
+    """
+    Convolutional encoder consisting of `len(convolutions)` layers.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_dim (int, optional): embedding dimension
+        embed_dict (optional): pre-trained embeddings to load into the token
+            embedding table via ``utils.load_embedding``
+        max_positions (int, optional): maximum supported input sequence length
+        convolutions (list, optional): the convolutional layer structure. Each
+            list item `i` corresponds to convolutional layer `i`. Layers are
+            given as ``(out_channels, kernel_width, [residual])``. Residual
+            connections are added between layers when ``residual=1`` (which is
+            the default behavior).
+        dropout (float, optional): dropout to be applied before each conv layer
+    """
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        embed_dict=None,
+        max_positions=1024,
+        convolutions=((512, 3),) * 20,
+        dropout=0.1,
+    ):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.num_attention_layers = None  # set by FConvModel.__init__; used in forward()
+
+        num_embeddings = len(dictionary)
+        self.padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        if embed_dict:
+            self.embed_tokens = utils.load_embedding(
+                embed_dict, self.dictionary, self.embed_tokens
+            )
+
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            self.padding_idx,
+        )
+
+        convolutions = extend_conv_spec(convolutions)
+        in_channels = convolutions[0][0]
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.residuals = []  # per-layer residual offset; 0 means no residual
+
+        layer_in_channels = [in_channels]  # input width, then each layer's output width
+        for _, (out_channels, kernel_size, residual) in enumerate(convolutions):
+            if residual == 0:
+                residual_dim = out_channels
+            else:
+                residual_dim = layer_in_channels[-residual]
+            self.projections.append(  # projection only when the widths differ
+                Linear(residual_dim, out_channels)
+                if residual_dim != out_channels
+                else None
+            )
+            if kernel_size % 2 == 1:
+                padding = kernel_size // 2
+            else:
+                padding = 0  # even kernels are padded explicitly in forward()
+            self.convolutions.append(
+                ConvTBC(
+                    in_channels,
+                    out_channels * 2,  # doubled because F.glu in forward() halves it
+                    kernel_size,
+                    dropout=dropout,
+                    padding=padding,
+                )
+            )
+            self.residuals.append(residual)
+            in_channels = out_channels
+            layer_in_channels.append(out_channels)
+        self.fc2 = Linear(in_channels, embed_dim)
+
+    def forward(self, src_tokens, src_lengths):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): lengths of each source sentence of shape
+                `(batch)`
+
+        Returns:
+            dict:
+                - **encoder_out** (tuple): a tuple with two elements, where the
+                  first element is the last encoder layer's output and the
+                  second element is the same quantity summed with the input
+                  embedding (used for attention). The shape of both tensors is
+                  `(batch, src_len, embed_dim)`.
+                - **encoder_padding_mask** (Tensor or None): padding positions
+                  of shape `(batch, src_len)`, or None if nothing is padded
+        """
+        # embed tokens and positions
+        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
+        x = self.dropout_module(x)
+        input_embedding = x  # kept for the attention output `y` below
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # used to mask padding in input
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None  # no padding anywhere: skip all masking
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        residuals = [x]  # layer inputs/outputs, indexed from the end by offset
+        # temporal convolutions
+        for proj, conv, res_layer in zip(
+            self.projections, self.convolutions, self.residuals
+        ):
+            if res_layer > 0:
+                residual = residuals[-res_layer]
+                residual = residual if proj is None else proj(residual)
+            else:
+                residual = None
+
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+            x = self.dropout_module(x)
+            if conv.kernel_size[0] % 2 == 1:
+                # padding is implicit in the conv
+                x = conv(x)
+            else:
+                padding_l = (conv.kernel_size[0] - 1) // 2
+                padding_r = conv.kernel_size[0] // 2
+                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))  # pads dim 0 (time)
+                x = conv(x)
+            x = F.glu(x, dim=2)  # gated linear unit over the channel dimension
+
+            if residual is not None:
+                x = (x + residual) * math.sqrt(0.5)
+            residuals.append(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        # project back to size of embedding
+        x = self.fc2(x)
+
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+        # scale gradients (this only affects backward, not forward)
+        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
+
+        # add output to input embedding for attention
+        y = (x + input_embedding) * math.sqrt(0.5)
+
+        return {
+            "encoder_out": (x, y),
+            "encoder_padding_mask": encoder_padding_mask,  # B x T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        if encoder_out["encoder_out"] is not None:  # NOTE: updates the dict in place
+            encoder_out["encoder_out"] = (
+                encoder_out["encoder_out"][0].index_select(0, new_order),
+                encoder_out["encoder_out"][1].index_select(0, new_order),
+            )
+        if encoder_out["encoder_padding_mask"] is not None:
+            encoder_out["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ].index_select(0, new_order)
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return self.embed_positions.max_positions
+
+
+class AttentionLayer(nn.Module):
+    """Attention over the encoder outputs, used by the FConvDecoder layers."""
+    def __init__(self, conv_channels, embed_dim, bmm=None):
+        super().__init__()
+        # projects from output of convolution to embedding dimension
+        self.in_projection = Linear(conv_channels, embed_dim)
+        # projects from embedding dimension to convolution size
+        self.out_projection = Linear(embed_dim, conv_channels)
+
+        self.bmm = bmm if bmm is not None else torch.bmm  # batched matmul to use
+
+    def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
+        residual = x
+        # attention: combine the projected input with the target embedding
+        x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
+        x = self.bmm(x, encoder_out[0])  # raw scores against encoder_out[0]
+
+        # don't attend over padding
+        if encoder_padding_mask is not None:
+            x = (
+                x.float()
+                .masked_fill(encoder_padding_mask.unsqueeze(1), float("-inf"))
+                .type_as(x)
+            )  # FP16 support: cast to float and back
+
+        # softmax over last dim
+        sz = x.size()
+        x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
+        x = x.view(sz)
+        attn_scores = x  # normalized weights, returned to the caller
+
+        x = self.bmm(x, encoder_out[1])  # weighted sum over encoder_out[1]
+
+        # scale attention output (respecting potentially different lengths)
+        s = encoder_out[1].size(1)
+        if encoder_padding_mask is None:
+            x = x * (s * math.sqrt(1.0 / s))  # i.e. multiply by sqrt(s)
+        else:
+            s = s - encoder_padding_mask.type_as(x).sum(
+                dim=1, keepdim=True
+            )  # exclude padding
+            s = s.unsqueeze(-1)
+            x = x * (s * s.rsqrt())  # per-row sqrt of the unpadded length
+
+        # project back
+        x = (self.out_projection(x) + residual) * math.sqrt(0.5)
+        return x, attn_scores
+
+    def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
+        """Replace torch.bmm with BeamableMM."""
+        if beamable_mm_beam_size is not None:
+            del self.bmm  # drop the plain attribute before registering a module
+            self.add_module("bmm", BeamableMM(beamable_mm_beam_size))
+
+
+class FConvDecoder(FairseqIncrementalDecoder):
+    """Convolutional decoder"""
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        embed_dict=None,
+        out_embed_dim=256,
+        max_positions=1024,
+        convolutions=((512, 3),) * 20,
+        attention=True,
+        dropout=0.1,
+        share_embed=False,
+        positional_embeddings=True,
+        adaptive_softmax_cutoff=None,
+        adaptive_softmax_dropout=0.0,
+    ):
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([2]))
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.need_attn = True
+
+        convolutions = extend_conv_spec(convolutions)
+        in_channels = convolutions[0][0]
+        if isinstance(attention, bool):
+            # expand True into [True, True, ...] and do the same with False
+            attention = [attention] * len(convolutions)
+        if not isinstance(attention, list) or len(attention) != len(convolutions):
+            raise ValueError(
+                "Attention is expected to be a list of booleans of "
+                "length equal to the number of layers."
+            )
+
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+        if embed_dict:
+            self.embed_tokens = utils.load_embedding(
+                embed_dict, self.dictionary, self.embed_tokens
+            )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                max_positions,
+                embed_dim,
+                padding_idx,
+            )
+            if positional_embeddings
+            else None
+        )
+
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        self.residuals = []
+
+        layer_in_channels = [in_channels]
+        for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
+            if residual == 0:
+                residual_dim = out_channels
+            else:
+                residual_dim = layer_in_channels[-residual]
+            self.projections.append(
+                Linear(residual_dim, out_channels)
+                if residual_dim != out_channels
+                else None
+            )
+            self.convolutions.append(
+                LinearizedConv1d(
+                    in_channels,
+                    out_channels * 2,
+                    kernel_size,
+                    padding=(kernel_size - 1),
+                    dropout=dropout,
+                )
+            )
+            self.attention.append(
+                AttentionLayer(out_channels, embed_dim) if attention[i] else None
+            )
+            self.residuals.append(residual)
+            in_channels = out_channels
+            layer_in_channels.append(out_channels)
+
+        self.adaptive_softmax = None
+        self.fc2 = self.fc3 = None
+
+        if adaptive_softmax_cutoff is not None:
+            assert not share_embed
+            self.adaptive_softmax = AdaptiveSoftmax(
+                num_embeddings,
+                in_channels,
+                adaptive_softmax_cutoff,
+                dropout=adaptive_softmax_dropout,
+            )
+        else:
+            self.fc2 = Linear(in_channels, out_embed_dim)
+            if share_embed:
+                assert out_embed_dim == embed_dim, (
+                    "Shared embed weights implies same dimensions "
+                    " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
+                )
+                self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
+                self.fc3.weight = self.embed_tokens.weight
+            else:
+                self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """Run the convolutional decoder and return ``(x, avg_attn_scores)``.
+
+        Args:
+            prev_output_tokens: previous target tokens; sliced as ``[:, -1:]``
+                when decoding incrementally, so it is indexed as a 2-D tensor.
+            encoder_out: optional dict providing ``"encoder_out"`` (a pair of
+                tensors) and ``"encoder_padding_mask"``.
+            incremental_state: cache for step-by-step decoding; ``None``
+                selects the full-sequence path.
+            **unused: accepted and ignored.
+
+        Returns:
+            tuple: the decoder output (projected through ``fc2``/``fc3`` when
+            both exist) and the accumulated attention scores, or ``None`` when
+            no scores were collected.
+        """
+        # NOTE(review): when encoder_out is None, encoder_a, encoder_b and
+        # encoder_padding_mask are never bound, so any non-None attention layer
+        # below would raise UnboundLocalError.
+        if encoder_out is not None:
+            encoder_padding_mask = encoder_out["encoder_padding_mask"]
+            encoder_out = encoder_out["encoder_out"]
+
+            # split and transpose encoder outputs
+            encoder_a, encoder_b = self._split_encoder_out(
+                encoder_out, incremental_state
+            )
+
+        # positions are computed from the full token tensor, before it is
+        # truncated to the last step below
+        if self.embed_positions is not None:
+            pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
+        else:
+            pos_embed = 0
+
+        # incremental decoding only feeds the newest token; _embed_tokens
+        # applies the same slice again, which is a no-op at that point
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+        x = self._embed_tokens(prev_output_tokens, incremental_state)
+
+        # embed tokens and combine with positional embeddings
+        x += pos_embed
+        x = self.dropout_module(x)
+        # kept aside and handed to every attention layer
+        target_embedding = x
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # B x T x C -> T x B x C
+        # (only when incremental_state is None; see _transpose_if_training)
+        x = self._transpose_if_training(x, incremental_state)
+
+        # temporal convolutions
+        avg_attn_scores = None
+        # counts every entry of self.attention, including entries that are None
+        num_attn_layers = len(self.attention)
+        # history of layer outputs; residuals[-k] is the output k layers back
+        residuals = [x]
+        for proj, conv, attention, res_layer in zip(
+            self.projections, self.convolutions, self.attention, self.residuals
+        ):
+            # res_layer > 0 selects a residual source; otherwise no residual
+            if res_layer > 0:
+                residual = residuals[-res_layer]
+                residual = residual if proj is None else proj(residual)
+            else:
+                residual = None
+
+            x = self.dropout_module(x)
+            x = conv(x, incremental_state)
+            # gated linear unit over dim 2
+            x = F.glu(x, dim=2)
+
+            # attention
+            if attention is not None:
+                # attention runs in the other layout; transposed back below
+                x = self._transpose_if_training(x, incremental_state)
+
+                x, attn_scores = attention(
+                    x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask
+                )
+
+                # scores are only accumulated in eval mode when requested
+                if not self.training and self.need_attn:
+                    attn_scores = attn_scores / num_attn_layers
+                    if avg_attn_scores is None:
+                        avg_attn_scores = attn_scores
+                    else:
+                        # in-place accumulation into the running total
+                        avg_attn_scores.add_(attn_scores)
+
+                x = self._transpose_if_training(x, incremental_state)
+
+            # residual
+            if residual is not None:
+                # scale by sqrt(0.5) after summing the two branches
+                x = (x + residual) * math.sqrt(0.5)
+            residuals.append(x)
+
+        # T x B x C -> B x T x C
+        x = self._transpose_if_training(x, incremental_state)
+
+        # project back to size of vocabulary if not using adaptive softmax
+        if self.fc2 is not None and self.fc3 is not None:
+            x = self.fc2(x)
+            x = self.dropout_module(x)
+            x = self.fc3(x)
+
+        return x, avg_attn_scores
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        """Reorder the cached state, including cached encoder outputs, by ``new_order``.
+
+        After delegating to the parent class, every tensor cached under
+        ``"encoder_out"`` is re-indexed along dim 0 and stored back.
+        """
+        super().reorder_incremental_state(incremental_state, new_order)
+        cached = utils.get_incremental_state(self, incremental_state, "encoder_out")
+        if cached is None:
+            # nothing has been cached yet
+            return
+        reordered = tuple(tensor.index_select(0, new_order) for tensor in cached)
+        utils.set_incremental_state(self, incremental_state, "encoder_out", reordered)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        # without positional embeddings there is no length limit
+        if self.embed_positions is None:
+            return float("inf")
+        return self.embed_positions.max_positions
+
+    def upgrade_state_dict(self, state_dict):
+        """Adapt this decoder to state dicts whose ``decoder.version`` is below 2.
+
+        For such state dicts every convolution has its weight norm removed and
+        re-applied with ``dim=0``, and ``decoder.version`` is written as 1.
+        State dicts at version 2 or above are returned untouched.
+        """
+        version = state_dict.get("decoder.version", torch.Tensor([1]))
+        is_old_format = utils.item(version[0]) < 2
+        if not is_old_format:
+            return state_dict
+
+        # old models use incorrect weight norm dimension: reconfigure each layer
+        for index, layer in enumerate(self.convolutions):
+            nn.utils.remove_weight_norm(layer)
+            self.convolutions[index] = nn.utils.weight_norm(layer, dim=0)
+        state_dict["decoder.version"] = torch.Tensor([1])
+        return state_dict
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        """Store ``need_attn``, which gates attention-score collection in ``forward``.
+
+        Any additional keyword arguments are accepted and ignored.
+        """
+        self.need_attn = need_attn
+
+    def _embed_tokens(self, tokens, incremental_state):
+        """Embed ``tokens``, using only the last column when decoding incrementally."""
+        # keep only the last token for incremental forward pass
+        last_step_only = incremental_state is not None
+        return self.embed_tokens(tokens[:, -1:] if last_step_only else tokens)
+
+    def _split_encoder_out(self, encoder_out, incremental_state):
+        """Split the encoder output pair and transpose its first element.
+
+        The result is cached in ``incremental_state`` under ``"encoder_out"``
+        when an incremental state is given, and a cached result is returned
+        as-is on later calls.
+        """
+        cache_key = "encoder_out"
+        cached = utils.get_incremental_state(self, incremental_state, cache_key)
+        if cached is not None:
+            return cached
+
+        first, second = encoder_out
+        # transpose only once to speed up attention layers
+        split = (first.transpose(1, 2).contiguous(), second)
+
+        if incremental_state is not None:
+            utils.set_incremental_state(self, incremental_state, cache_key, split)
+        return split
+
+    def _transpose_if_training(self, x, incremental_state):
+        """Swap dims 0 and 1 of ``x`` unless an incremental state is in use."""
+        return x if incremental_state is not None else x.transpose(0, 1)
+
+
+def extend_conv_spec(convolutions):
+    """
+    Give every layer of a convolution spec an explicit residual offset.
+
+    Each entry of ``convolutions`` is a sequence of 2 or 3 values:
+    ``(dim, kernel_size)`` or ``(dim, kernel_size, residual)``, where
+    ``residual`` is how many layers behind to look for the residual input.
+    Two-element entries get the default ``residual`` of 1; three-element
+    entries are passed through unchanged.
+
+    Returns:
+        tuple: one entry per input layer, each with three values.
+
+    Raises:
+        ValueError: if an entry does not have 2 or 3 values.
+    """
+    extended = []
+    for spec in convolutions:
+        if len(spec) == 3:
+            extended.append(spec)
+        elif len(spec) == 2:
+            # tuple() lets list-style entries such as [512, 3] work too
+            extended.append(tuple(spec) + (1,))
+        else:
+            raise ValueError(
+                "invalid number of parameters in convolution spec "
+                + str(spec)
+                + ". expected 2 or 3"
+            )
+    return tuple(extended)
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    """Build an ``nn.Embedding`` with N(0, 0.1) weights and a zeroed padding row."""
+    embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    weight = embedding.weight
+    nn.init.normal_(weight, 0, 0.1)
+    nn.init.constant_(weight[padding_idx], 0)
+    return embedding
+
+
+def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx):
+    """Build a ``LearnedPositionalEmbedding`` with N(0, 0.1) weights and a zeroed padding row."""
+    positions = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
+    weight = positions.weight
+    nn.init.normal_(weight, 0, 0.1)
+    nn.init.constant_(weight[padding_idx], 0)
+    return positions
+
+
+def Linear(in_features, out_features, dropout=0.0):
+    """Weight-normalized Linear layer (input: N x T x C)"""
+    layer = nn.Linear(in_features, out_features)
+    # initial weight std depends on the dropout rate and the fan-in
+    init_std = math.sqrt((1 - dropout) / in_features)
+    nn.init.normal_(layer.weight, mean=0, std=init_std)
+    nn.init.constant_(layer.bias, 0)
+    return nn.utils.weight_norm(layer)
+
+
+def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
+    """Weight-normalized Conv1d layer optimized for decoding"""
+    conv = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
+    # initial weight std depends on dropout, kernel width and input channels
+    fan_in = conv.kernel_size[0] * in_channels
+    init_std = math.sqrt((4 * (1.0 - dropout)) / fan_in)
+    nn.init.normal_(conv.weight, mean=0, std=init_std)
+    nn.init.constant_(conv.bias, 0)
+    return nn.utils.weight_norm(conv, dim=2)
+
+
+def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
+    """Weight-normalized Conv1d layer"""
+    # imported locally under an alias; this factory shares the class's name
+    from fairseq.modules import ConvTBC as _ConvTBC
+
+    conv = _ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
+    fan_in = conv.kernel_size[0] * in_channels
+    init_std = math.sqrt((4 * (1.0 - dropout)) / fan_in)
+    nn.init.normal_(conv.weight, mean=0, std=init_std)
+    nn.init.constant_(conv.bias, 0)
+    return nn.utils.weight_norm(conv, dim=2)
+
+
+@register_model_architecture("fconv", "fconv")
+def base_architecture(args):
+    """Fill in the defaults of the base ``fconv`` architecture on ``args``.
+
+    Attributes already present on ``args`` keep their value; missing ones
+    receive the default listed below.
+    """
+    defaults = (
+        ("dropout", 0.1),
+        ("encoder_embed_dim", 512),
+        ("encoder_embed_path", None),
+        ("encoder_layers", "[(512, 3)] * 20"),
+        ("decoder_embed_dim", 512),
+        ("decoder_embed_path", None),
+        ("decoder_layers", "[(512, 3)] * 20"),
+        ("decoder_out_embed_dim", 256),
+        ("decoder_attention", "True"),
+        ("share_input_output_embed", False),
+    )
+    for attr, fallback in defaults:
+        setattr(args, attr, getattr(args, attr, fallback))
+
+
+@register_model_architecture("fconv", "fconv_iwslt_de_en")
+def fconv_iwslt_de_en(args):
+    """``fconv_iwslt_de_en`` preset: 256-dim embeddings, 4 encoder and 3 decoder layers.
+
+    Only missing attributes are set; the remaining defaults come from
+    ``base_architecture``.
+    """
+    overrides = (
+        ("encoder_embed_dim", 256),
+        ("encoder_layers", "[(256, 3)] * 4"),
+        ("decoder_embed_dim", 256),
+        ("decoder_layers", "[(256, 3)] * 3"),
+        ("decoder_out_embed_dim", 256),
+    )
+    for attr, fallback in overrides:
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture("fconv", "fconv_wmt_en_ro")
+def fconv_wmt_en_ro(args):
+    """``fconv_wmt_en_ro`` preset: base architecture with a 512-dim decoder output embedding."""
+    out_dim = getattr(args, "decoder_out_embed_dim", 512)
+    args.decoder_out_embed_dim = out_dim
+    base_architecture(args)
+
+
+@register_model_architecture("fconv", "fconv_wmt_en_de")
+def fconv_wmt_en_de(args):
+    """``fconv_wmt_en_de`` preset: 15 layers widening from 512 to 2048 units.
+
+    The same layer expression is the default for both encoder and decoder.
+    """
+    convs = " + ".join(
+        [
+            "[(512, 3)] * 9",  # first 9 layers have 512 units
+            "[(1024, 3)] * 4",  # next 4 layers have 1024 units
+            "[(2048, 1)] * 2",  # final 2 layers use 1x1 convolutions
+        ]
+    )
+
+    overrides = (
+        ("encoder_embed_dim", 768),
+        ("encoder_layers", convs),
+        ("decoder_embed_dim", 768),
+        ("decoder_layers", convs),
+        ("decoder_out_embed_dim", 512),
+    )
+    for attr, fallback in overrides:
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture("fconv", "fconv_wmt_en_fr")
+def fconv_wmt_en_fr(args):
+    """``fconv_wmt_en_fr`` preset: 15 layers widening from 512 to 4096 units.
+
+    The same layer expression is the default for both encoder and decoder.
+    """
+    convs = " + ".join(
+        [
+            "[(512, 3)] * 6",  # first 6 layers have 512 units
+            "[(768, 3)] * 4",  # next 4 layers have 768 units
+            "[(1024, 3)] * 3",  # next 3 layers have 1024 units
+            "[(2048, 1)] * 1",  # next 1 layer uses 1x1 convolutions
+            "[(4096, 1)] * 1",  # final 1 layer uses 1x1 convolutions
+        ]
+    )
+
+    overrides = (
+        ("encoder_embed_dim", 768),
+        ("encoder_layers", convs),
+        ("decoder_embed_dim", 768),
+        ("decoder_layers", convs),
+        ("decoder_out_embed_dim", 512),
+    )
+    for attr, fallback in overrides:
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
new file mode 100644
index 0000000000..4b243d6669
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
@@ -0,0 +1,136 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.fconv import FConvDecoder
+from fairseq.utils import safe_hasattr
+
+
+@register_model("fconv_lm")
+class FConvLanguageModel(FairseqLanguageModel):
+    """Language model registered as ``fconv_lm`` that wraps a single ``FConvDecoder``."""
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        add = parser.add_argument
+        add("--dropout", type=float, metavar="D", help="dropout probability")
+        add(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        add(
+            "--decoder-layers",
+            type=str,
+            metavar="EXPR",
+            help="decoder layers [(dim, kernel_size), ...]",
+        )
+        # NOTE(review): build_model never reads this option; it passes
+        # decoder_embed_dim as the output embedding size instead.
+        add(
+            "--decoder-out-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder output embedding dimension",
+        )
+        add(
+            "--adaptive-softmax-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive softmax cutoff points. "
+            "Must be used with adaptive_loss criterion",
+        )
+        add(
+            "--adaptive-softmax-dropout",
+            type=float,
+            metavar="D",
+            help="sets adaptive softmax dropout for the tail projections",
+        )
+        add(
+            "--decoder-attention",
+            type=str,
+            metavar="EXPR",
+            help="decoder attention [True, ...]",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # back-fill arguments that older models may not carry
+        base_lm_architecture(args)
+
+        # fall back to max_target_positions when tokens_per_sample is absent
+        if safe_hasattr(args, "max_target_positions"):
+            if not safe_hasattr(args, "tokens_per_sample"):
+                args.tokens_per_sample = args.max_target_positions
+
+        # NOTE(review): decoder_layers and decoder_attention are Python
+        # expressions run through eval(); only use with trusted arguments.
+        decoder_kwargs = dict(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.tokens_per_sample,
+            share_embed=False,
+            positional_embeddings=False,
+            # the cutoff list is only parsed for the adaptive_loss criterion
+            adaptive_softmax_cutoff=(
+                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == "adaptive_loss"
+                else None
+            ),
+            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
+        )
+        return FConvLanguageModel(FConvDecoder(**decoder_kwargs))
+
+
+@register_model_architecture("fconv_lm", "fconv_lm")
+def base_lm_architecture(args):
+    """Fill in the defaults of the base ``fconv_lm`` architecture on ``args``.
+
+    Attributes already present on ``args`` keep their value.
+    """
+    defaults = (
+        ("dropout", 0.1),
+        ("decoder_embed_dim", 128),
+        ("decoder_layers", "[(1268, 4)] * 13"),
+        ("decoder_attention", "False"),
+        ("adaptive_softmax_cutoff", None),
+        ("adaptive_softmax_dropout", 0),
+    )
+    for attr, fallback in defaults:
+        setattr(args, attr, getattr(args, attr, fallback))
+
+
+@register_model_architecture("fconv_lm", "fconv_lm_dauphin_wikitext103")
+def fconv_lm_dauphin_wikitext103(args):
+    """``fconv_lm_dauphin_wikitext103`` preset: 14-layer decoder, adaptive softmax cutoffs.
+
+    Only missing attributes are set; the remaining defaults come from
+    ``base_lm_architecture``.
+    """
+    layers = " + ".join(
+        [
+            "[(850, 6)] * 3",
+            "[(850, 1)] * 1",
+            "[(850, 5)] * 4",
+            "[(850, 1)] * 1",
+            "[(850, 4)] * 3",
+            "[(1024, 4)] * 1",
+            "[(2048, 4)] * 1",
+        ]
+    )
+    overrides = (
+        ("decoder_embed_dim", 280),
+        ("decoder_layers", layers),
+        ("decoder_attention", "False"),
+        ("adaptive_softmax_cutoff", "10000,20000,200000"),
+    )
+    for attr, fallback in overrides:
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_lm_architecture(args)
+
+
+@register_model_architecture("fconv_lm", "fconv_lm_dauphin_gbw")
+def fconv_lm_dauphin_gbw(args):
+    """``fconv_lm_dauphin_gbw`` preset: three-layer groups with explicit residual offsets.
+
+    Only missing attributes are set; the remaining defaults come from
+    ``base_lm_architecture``.
+    """
+    layers = " + ".join(
+        [
+            "[(512, 5)]",
+            "[(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3",
+            "[(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3",
+            "[(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6",
+            "[(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]",
+        ]
+    )
+    overrides = (
+        ("decoder_embed_dim", 128),
+        ("decoder_layers", layers),
+        ("decoder_attention", "False"),
+        ("adaptive_softmax_cutoff", "10000,50000,200000"),
+    )
+    for attr, fallback in overrides:
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_lm_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
new file mode 100644
index 0000000000..8357ef7847
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
@@ -0,0 +1,674 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import checkpoint_utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.models import (
+    CompositeEncoder,
+    FairseqDecoder,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    DownsampledMultiHeadAttention,
+    FairseqDropout,
+    GradMultiply,
+    LayerNorm,
+    LearnedPositionalEmbedding,
+    LinearizedConvolution,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("fconv_self_att")
+class FConvModelSelfAtt(FairseqEncoderDecoderModel):
+    """Convolutional encoder-decoder model registered as ``fconv_self_att``.
+
+    When built with a pretrained checkpoint, the pretrained encoder is
+    forwarded alongside the trained one through a ``CompositeEncoder`` and the
+    pretrained decoder is handed to the new decoder.
+    """
+
+    @classmethod
+    def hub_models(cls):
+        """Return the downloadable archives known for this model, keyed by name."""
+        return {
+            "conv.stories.pretrained": {
+                "path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz",
+                "checkpoint_file": "pretrained_checkpoint.pt",
+                "tokenizer": "nltk",
+            },
+            "conv.stories": {
+                "path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz",
+                "checkpoint_file": "fusion_checkpoint.pt",
+                "tokenizer": "nltk",
+                "pretrained": "True",
+                "pretrained_checkpoint": "./pretrained_checkpoint.pt",
+            },
+            # Test set containing dictionaries
+            "data.stories": "https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2",
+        }
+
+    def __init__(self, encoder, decoder, pretrained_encoder=None):
+        """Wire encoder and decoder together, optionally adding a pretrained encoder.
+
+        Args:
+            encoder: the encoder being trained.
+            decoder: the decoder; its ``attention`` entries are counted.
+            pretrained_encoder: optional second encoder exposed under the
+                ``"pretrained"`` key of the composite encoder.
+        """
+        super().__init__(encoder, decoder)
+        # set on the raw encoder (before it is wrapped below): the number of
+        # decoder attention entries that are not None
+        self.encoder.num_attention_layers = sum(
+            layer is not None for layer in decoder.attention
+        )
+        self.pretrained_encoder = pretrained_encoder
+        if self.pretrained_encoder is None:
+            encoders = {"encoder": encoder}
+        else:
+            encoders = {"encoder": encoder, "pretrained": self.pretrained_encoder}
+        # for fusion model, CompositeEncoder contains both pretrained and training encoders
+        # these are forwarded and then combined in the decoder
+        self.encoder = CompositeEncoder(encoders)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
+                            help='encoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
+                            help='decoder layers [(dim, kernel_size), ...]')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
+                            help='decoder attention [True, ...]')
+        parser.add_argument('--self-attention', type=str, metavar='EXPR',
+                            help='decoder self-attention layers, ex: [True] + [False]*5')
+        parser.add_argument('--multihead-attention-nheads', type=int,
+                            help='Number of heads to use in attention')
+        parser.add_argument('--multihead-self-attention-nheads', type=int,
+                            help='Number of heads to use in self-attention')
+        parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
+                            help='encoder attention [True, ...]')
+        parser.add_argument('--encoder-attention-nheads', type=int,
+                            help='Number of heads to use in encoder attention')
+        parser.add_argument('--project-input', type=str, metavar='EXPR',
+                            help='Use projections in self-attention [True, ...]')
+        parser.add_argument('--gated-attention', type=str, metavar='EXPR',
+                            help='Use GLU layers in self-attention projections [True, ...]')
+        parser.add_argument('--downsample', type=str, metavar='EXPR',
+                            help='Use downsampling in self-attention [True, ...]')
+        parser.add_argument('--pretrained-checkpoint', metavar='DIR',
+                            help='path to load checkpoint from pretrained model')
+        parser.add_argument('--pretrained', type=str, metavar='EXPR',
+                            help='use pretrained model when training [True, ...]')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        NOTE(review): every ``EXPR`` argument is run through ``eval()``; only
+        use with trusted argument values.
+        """
+        trained_encoder, trained_decoder = None, None
+        pretrained = eval(args.pretrained)
+        if pretrained:
+            logger.info("loading pretrained model")
+            # if the path does not exist as given, retry it relative to args.data
+            if not os.path.exists(args.pretrained_checkpoint):
+                new_pretrained_checkpoint = os.path.join(
+                    args.data, args.pretrained_checkpoint
+                )
+                if os.path.exists(new_pretrained_checkpoint):
+                    args.pretrained_checkpoint = new_pretrained_checkpoint
+            # [0][0]: presumably the first model of the loaded ensemble --
+            # TODO confirm against checkpoint_utils.load_model_ensemble
+            trained_model = checkpoint_utils.load_model_ensemble(
+                filenames=[args.pretrained_checkpoint],
+                task=task,
+            )[0][0]
+            # relies on child order: index 0 is used as the encoder and
+            # index 1 as the decoder
+            trained_decoder = list(trained_model.children())[1]
+            trained_encoder = list(trained_model.children())[0]
+
+            # freeze pretrained model
+            for param in trained_decoder.parameters():
+                param.requires_grad = False
+            for param in trained_encoder.parameters():
+                param.requires_grad = False
+
+        encoder = FConvEncoder(
+            task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            convolutions=eval(args.encoder_layers),
+            dropout=args.dropout,
+            max_positions=args.max_source_positions,
+            attention=eval(args.encoder_attention),
+            attention_nheads=args.encoder_attention_nheads,
+        )
+
+        decoder = FConvDecoder(
+            task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            convolutions=eval(args.decoder_layers),
+            out_embed_dim=args.decoder_out_embed_dim,
+            attention=eval(args.decoder_attention),
+            dropout=args.dropout,
+            max_positions=args.max_target_positions,
+            selfattention=eval(args.self_attention),
+            attention_nheads=args.multihead_attention_nheads,
+            selfattention_nheads=args.multihead_self_attention_nheads,
+            project_input=eval(args.project_input),
+            gated_attention=eval(args.gated_attention),
+            downsample=eval(args.downsample),
+            pretrained=pretrained,
+            trained_decoder=trained_decoder,
+        )
+        # trained_encoder is None unless a pretrained checkpoint was loaded
+        model = FConvModelSelfAtt(encoder, decoder, trained_encoder)
+
+        return model
+
+    @property
+    def pretrained(self):
+        """True when a pretrained encoder was supplied to the constructor."""
+        return self.pretrained_encoder is not None
+
+
+class FConvEncoder(FairseqEncoder):
+    """Convolutional encoder
+
+    Stacks gated convolutions, each optionally followed by self-attention,
+    between an input projection (``fc1``) and an output projection (``fc2``).
+    ``num_attention_layers`` must be assigned by the owner before ``forward``
+    is called; it starts out as ``None``.
+    """
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        max_positions=1024,
+        convolutions=((512, 3),) * 20,
+        dropout=0.1,
+        attention=False,
+        attention_nheads=1,
+    ):
+        """Build embeddings, projections, convolutions and attention layers.
+
+        Args:
+            dictionary: source dictionary; provides ``len()`` and ``pad()``.
+            embed_dim: embedding size.
+            max_positions: size passed to the positional embedding.
+            convolutions: sequence of ``(out_channels, kernel_size)`` pairs,
+                one per layer (exactly two values; the loop unpacks them).
+            dropout: dropout probability.
+            attention: one bool applied to every layer, or one value per layer.
+            attention_nheads: head count for the self-attention layers.
+        """
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        # filled in from outside; forward() divides by it
+        self.num_attention_layers = None
+
+        num_embeddings = len(dictionary)
+        self.padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            self.padding_idx,
+        )
+
+        def expand_bool_array(val):
+            if isinstance(val, bool):
+                # expand True into [True, True, ...] and do the same with False
+                return [val] * len(convolutions)
+            return val
+
+        attention = expand_bool_array(attention)
+
+        in_channels = convolutions[0][0]
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        # created empty; nothing in this class appends to it
+        self.attproj = nn.ModuleList()
+        for i, (out_channels, kernel_size) in enumerate(convolutions):
+            # a projection is only needed when the channel count changes
+            self.projections.append(
+                Linear(in_channels, out_channels)
+                if in_channels != out_channels
+                else None
+            )
+            # twice the output channels: forward() applies a GLU afterwards
+            self.convolutions.append(
+                ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
+            )
+
+            self.attention.append(
+                SelfAttention(out_channels, embed_dim, attention_nheads)
+                if attention[i]
+                else None
+            )
+            in_channels = out_channels
+
+        self.fc2 = Linear(in_channels, embed_dim)
+
+    def forward(self, src_tokens, src_lengths):
+        """Encode ``src_tokens``.
+
+        Args:
+            src_tokens: source token ids (B x T per the layout comments below).
+            src_lengths: not used by this method.
+
+        Returns:
+            dict: ``"encoder_out"`` holds the pair ``(x, y)`` where ``y`` is
+            ``x`` combined with the input embedding; ``"encoder_padding_mask"``
+            is the B x T padding mask, or ``None`` when nothing is padded.
+        """
+        # embed tokens and positions
+        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
+        x = self.dropout_module(x)
+        # kept transposed here and transposed back when building y below
+        input_embedding = x.transpose(0, 1)
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
+        # drop the mask entirely when no position is padding
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # temporal convolutions
+        for proj, conv, attention in zip(
+            self.projections, self.convolutions, self.attention
+        ):
+            # residual is taken before masking and dropout
+            residual = x if proj is None else proj(x)
+
+            # zero out padded positions before convolving
+            if encoder_padding_mask is not None:
+                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+            x = self.dropout_module(x)
+            # pad dim 0 with kernel_size - 1 zeros in total, split left/right
+            padding_l = (conv.kernel_size[0] - 1) // 2
+            padding_r = conv.kernel_size[0] // 2
+            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
+            x = conv(x)
+            x = F.glu(x, dim=2)
+            if attention is not None:
+                x = attention(x)
+            x = (x + residual) * math.sqrt(0.5)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        # project back to size of embedding
+        x = self.fc2(x)
+
+        if encoder_padding_mask is not None:
+            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
+            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
+
+        # scale gradients (this only affects backward, not forward)
+        # NOTE(review): num_attention_layers is None until assigned externally
+        # (TypeError here), and a value of 0 raises ZeroDivisionError.
+        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
+
+        # add output to input embedding for attention
+        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)
+
+        return {
+            "encoder_out": (x, y),
+            "encoder_padding_mask": encoder_padding_mask,  # B x T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Re-index the encoder outputs along dim 0 by ``new_order``.
+
+        ``encoder_out`` is modified in place and also returned. The padding
+        mask and the ``"pretrained"`` entry are reordered only when present.
+        """
+        encoder_out["encoder_out"] = tuple(
+            eo.index_select(0, new_order) for eo in encoder_out["encoder_out"]
+        )
+
+        if encoder_out["encoder_padding_mask"] is not None:
+            encoder_out["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ].index_select(0, new_order)
+
+        if "pretrained" in encoder_out:
+            encoder_out["pretrained"]["encoder_out"] = tuple(
+                eo.index_select(0, new_order)
+                for eo in encoder_out["pretrained"]["encoder_out"]
+            )
+
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return self.embed_positions.max_positions
+
+
+@with_incremental_state
+class FConvDecoder(FairseqDecoder):
+    """Convolutional decoder"""
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        out_embed_dim=256,
+        max_positions=1024,
+        convolutions=((512, 3),) * 8,
+        attention=True,
+        dropout=0.1,
+        selfattention=False,
+        attention_nheads=1,
+        selfattention_nheads=1,
+        project_input=False,
+        gated_attention=False,
+        downsample=False,
+        pretrained=False,
+        trained_decoder=None,
+    ):
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([2]))
+        self.pretrained = pretrained
+        self.pretrained_decoder = trained_decoder
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.need_attn = True
+        in_channels = convolutions[0][0]
+
+        def expand_bool_array(val):
+            if isinstance(val, bool):
+                # expand True into [True, True, ...] and do the same with False
+                return [val] * len(convolutions)
+            return val
+
+        attention = expand_bool_array(attention)
+        selfattention = expand_bool_array(selfattention)
+
+        if not isinstance(attention, list) or len(attention) != len(convolutions):
+            raise ValueError(
+                "Attention is expected to be a list of booleans of "
+                "length equal to the number of layers."
+            )
+
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+
+        self.embed_positions = PositionalEmbedding(
+            max_positions,
+            embed_dim,
+            padding_idx,
+        )
+
+        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
+        self.projections = nn.ModuleList()
+        self.convolutions = nn.ModuleList()
+        self.attention = nn.ModuleList()
+        self.selfattention = nn.ModuleList()
+        self.attproj = nn.ModuleList()
+        for i, (out_channels, kernel_size) in enumerate(convolutions):
+            self.projections.append(
+                Linear(in_channels, out_channels)
+                if in_channels != out_channels
+                else None
+            )
+            self.convolutions.append(
+                LinearizedConv1d(
+                    in_channels,
+                    out_channels * 2,
+                    kernel_size,
+                    padding=(kernel_size - 1),
+                    dropout=dropout,
+                )
+            )
+
+            self.attention.append(
+                DownsampledMultiHeadAttention(
+                    out_channels,
+                    embed_dim,
+                    attention_nheads,
+                    project_input=project_input,
+                    gated=False,
+                    downsample=False,
+                )
+                if attention[i]
+                else None
+            )
+
+            self.attproj.append(
+                Linear(out_channels, embed_dim, dropout=dropout)
+                if attention[i]
+                else None
+            )
+            self.selfattention.append(
+                SelfAttention(
+                    out_channels,
+                    embed_dim,
+                    selfattention_nheads,
+                    project_input=project_input,
+                    gated=gated_attention,
+                    downsample=downsample,
+                )
+                if selfattention[i]
+                else None
+            )
+            in_channels = out_channels
+
+        self.fc2 = Linear(in_channels, out_embed_dim)
+        self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
+
+        # model fusion
+        if self.pretrained:
+            # independent gates are learned from the concatenated input
+            self.gate1 = nn.Sequential(
+                Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid()
+            )
+            self.gate2 = nn.Sequential(
+                Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid()
+            )
+            # pretrained and trained models are joined
+            self.joining = nn.Sequential(
+                Linear(out_embed_dim * 2, out_embed_dim * 2),
+                LayerNorm(out_embed_dim * 2),
+                nn.GLU(),
+                Linear(out_embed_dim, out_embed_dim * 2),
+                LayerNorm(out_embed_dim * 2),
+                nn.GLU(),
+                Linear(out_embed_dim, out_embed_dim),
+                LayerNorm(out_embed_dim),
+            )
+            # pretrained model contains an output layer that is nhid -> vocab size
+            # but the models are combined in their hidden state
+            # the hook stores the output of the pretrained model forward
+            self.pretrained_outputs = {}
+
+            def save_output():
+                def hook(a, b, output):
+                    self.pretrained_outputs["out"] = output
+
+                return hook
+
+            self.pretrained_decoder.fc2.register_forward_hook(save_output())
+
+    def forward(self, prev_output_tokens, encoder_out):  # -> (output, attention scores or None)
+        trained_encoder_out = encoder_out["pretrained"] if self.pretrained else None
+        encoder_out = encoder_out["encoder"]["encoder_out"]
+        # both encoder outputs are transposed once here and reused by every attention layer
+        encoder_a, encoder_b = self._split_encoder_out(encoder_out)
+
+        # embed positions
+        positions = self.embed_positions(prev_output_tokens)
+
+        # embed tokens and positions
+        x = self.embed_tokens(prev_output_tokens) + positions
+        x = self.dropout_module(x)
+        target_embedding = x.transpose(0, 1)  # kept aside; added to the attention query below
+
+        # project to size of convolution
+        x = self.fc1(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # temporal convolutions
+        avg_attn_scores = None
+        for proj, conv, attention, selfattention, attproj in zip(
+            self.projections,
+            self.convolutions,
+            self.attention,
+            self.selfattention,
+            self.attproj,
+        ):
+            residual = x if proj is None else proj(x)
+
+            x = self.dropout_module(x)
+            x = conv(x)
+            x = F.glu(x, dim=2)
+
+            # attention over the encoder outputs, with a skip connection (r) around it
+            if attention is not None:
+                r = x
+                x, attn_scores = attention(
+                    attproj(x) + target_embedding, encoder_a, encoder_b
+                )
+                x = x + r
+                if not self.training and self.need_attn:
+                    if avg_attn_scores is None:
+                        avg_attn_scores = attn_scores
+                    else:
+                        avg_attn_scores.add_(attn_scores)  # NOTE(review): in-place running sum; not divided by the layer count here
+
+            if selfattention is not None:
+                x = selfattention(x)
+
+            x = (x + residual) * math.sqrt(0.5)  # presumably keeps the variance of the sum stable — TODO confirm
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        # fc2 projects to the output embedding size; fc3 (non-pretrained path) maps to the vocabulary
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        if not self.pretrained:
+            x = self.fc3(x)
+
+        # fusion gating: running the pretrained decoder fills self.pretrained_outputs["out"] via its fc2 forward hook
+        if self.pretrained:
+            trained_x, _ = self.pretrained_decoder.forward(
+                prev_output_tokens, trained_encoder_out
+            )  # trained_x itself is not used below; only the hooked fc2 output is
+            y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
+            gate1 = self.gate1(y)
+            gate2 = self.gate2(y)
+            gated_x1 = gate1 * x
+            gated_x2 = gate2 * self.pretrained_outputs["out"]
+            fusion = torch.cat([gated_x1, gated_x2], dim=-1)
+            fusion = self.joining(fusion)
+            fusion_output = self.fc3(fusion)
+            return fusion_output, avg_attn_scores
+        else:
+            return x, avg_attn_scores
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        return self.embed_positions.max_positions  # attribute of the positional embedding module
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):  # extra kwargs are accepted but ignored
+        self.need_attn = need_attn  # forward() collects attention scores only in eval mode with this flag set
+
+    def _split_encoder_out(self, encoder_out):
+        """Split and transpose encoder outputs."""
+        # transpose only once to speed up attention layers
+        encoder_a, encoder_b = encoder_out  # encoder_out must unpack into exactly two tensors
+        encoder_a = encoder_a.transpose(0, 1).contiguous()  # swap dims 0 and 1, then make the result contiguous
+        encoder_b = encoder_b.transpose(0, 1).contiguous()
+        result = (encoder_a, encoder_b)
+        return result
+
+
+class SelfAttention(nn.Module):  # q/k/v projections -> multi-head attention -> residual add -> LayerNorm
+    def __init__(
+        self,
+        out_channels,
+        embed_dim,
+        num_heads,
+        project_input=False,
+        gated=False,
+        downsample=False,
+    ):
+        super().__init__()
+        self.attention = DownsampledMultiHeadAttention(
+            out_channels,
+            embed_dim,
+            num_heads,
+            dropout=0,
+            bias=True,
+            project_input=project_input,
+            gated=gated,
+            downsample=downsample,
+        )
+        self.in_proj_q = Linear(out_channels, embed_dim)  # separate input projections for query, key and value
+        self.in_proj_k = Linear(out_channels, embed_dim)
+        self.in_proj_v = Linear(out_channels, embed_dim)
+        self.ln = LayerNorm(out_channels)
+
+    def forward(self, x):
+        residual = x
+        query = self.in_proj_q(x)
+        key = self.in_proj_k(x)
+        value = self.in_proj_v(x)
+        x, _ = self.attention(  # the second return value is discarded
+            query, key, value, mask_future_timesteps=True, use_scalar_bias=True
+        )
+        return self.ln(x + residual)  # residual connection followed by layer normalization
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):  # nn.Embedding with weights drawn from N(0, 0.1)
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    m.weight.data.normal_(0, 0.1)  # NOTE(review): re-initialises every row, including the padding_idx row
+    return m
+
+
+def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx):  # LearnedPositionalEmbedding, N(0, 0.1) init
+    m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
+    m.weight.data.normal_(0, 0.1)  # mean 0, std 0.1, applied in place to the whole weight tensor
+    return m
+
+
+def Linear(in_features, out_features, dropout=0.0):
+    """Weight-normalized Linear layer (input: N x T x C)"""  # NOTE(review): no weight normalization is applied in this function
+    m = nn.Linear(in_features, out_features)
+    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))  # dropout only scales the init std
+    m.bias.data.zero_()
+    return m
+
+
+def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
+    """Weight-normalized Conv1d layer optimized for decoding"""  # NOTE(review): no weight normalization is applied in this function
+    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)  # kwargs are forwarded unchanged
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))  # dropout only scales the init std
+    m.weight.data.normal_(mean=0, std=std)
+    m.bias.data.zero_()
+    return m
+
+
+def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
+    """Weight-normalized Conv1d layer"""  # NOTE(review): no weight normalization is applied in this function
+    from fairseq.modules import ConvTBC
+    # the local import above shadows this wrapper's own name: below, ConvTBC is the fairseq.modules class
+    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
+    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))  # dropout only scales the init std
+    m.weight.data.normal_(mean=0, std=std)
+    m.bias.data.zero_()
+    return m
+
+
+@register_model_architecture("fconv_self_att", "fconv_self_att")
+def base_architecture(args):  # fills in a default for every option that is not already set on args
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_layers = getattr(args, "encoder_layers", "[(512, 3)] * 3")  # layer specs are kept as strings
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_layers = getattr(args, "decoder_layers", "[(512, 3)] * 8")
+    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256)
+    args.decoder_attention = getattr(args, "decoder_attention", "True")  # boolean-like options default to the strings "True"/"False"
+    args.self_attention = getattr(args, "self_attention", "False")  # presumably parsed later by the model builder — TODO confirm
+    args.encoder_attention = getattr(args, "encoder_attention", "False")
+    args.multihead_attention_nheads = getattr(args, "multihead_attention_nheads", 1)
+    args.multihead_self_attention_nheads = getattr(
+        args, "multihead_self_attention_nheads", 1
+    )
+    args.encoder_attention_nheads = getattr(args, "encoder_attention_nheads", 1)
+    args.project_input = getattr(args, "project_input", "False")
+    args.gated_attention = getattr(args, "gated_attention", "False")
+    args.downsample = getattr(args, "downsample", "False")
+    args.pretrained_checkpoint = getattr(args, "pretrained_checkpoint", "")
+    args.pretrained = getattr(args, "pretrained", "False")
+
+
+@register_model_architecture("fconv_self_att", "fconv_self_att_wp")
+def fconv_self_att_wp(args):  # variant defaults are set first; base_architecture() then fills in the rest
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
+    args.encoder_layers = getattr(
+        args, "encoder_layers", "[(128, 3)] * 2 + [(512,3)] * 1"
+    )
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256)
+    args.decoder_layers = getattr(
+        args, "decoder_layers", "[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1"
+    )
+    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256)
+    args.self_attention = getattr(args, "self_attention", "True")  # differs from the base default ("False")
+    args.multihead_self_attention_nheads = getattr(
+        args, "multihead_self_attention_nheads", 4
+    )
+    args.project_input = getattr(args, "project_input", "True")
+    args.gated_attention = getattr(args, "gated_attention", "True")
+    args.downsample = getattr(args, "downsample", "True")
+    base_architecture(args)  # only sets attributes that are still missing, so the values above are kept
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
new file mode 100644
index 0000000000..a1b0eabbdb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hubert import *  # noqa
+from .hubert_asr import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
new file mode 100644
index 0000000000..0c0349eb2f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
@@ -0,0 +1,570 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from omegaconf import II
+
+from fairseq import utils
+from fairseq.data.data_utils import compute_mask_indices
+from fairseq.data.dictionary import Dictionary
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.models import BaseFairseqModel, register_model
+from fairseq.models.wav2vec.wav2vec2 import (
+    EXTRACTOR_MODE_CHOICES,
+    MASKING_DISTRIBUTION_CHOICES,
+    LAYER_TYPE_CHOICES,
+    ConvFeatureExtractionModel,
+    TransformerEncoder,
+)
+from fairseq.modules import GradMultiply, LayerNorm
+from fairseq.tasks.hubert_pretraining import (
+    HubertPretrainingConfig,
+    HubertPretrainingTask,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class HubertConfig(FairseqDataclass):  # options of the model registered under the name "hubert"
+    label_rate: float = II("task.label_rate")
+
+    extractor_mode: EXTRACTOR_MODE_CHOICES = field(
+        default="default",
+        metadata={
+            "help": "mode for feature extractor. default has a single group "
+            "norm with d groups in the first conv block, whereas layer_norm "
+            "has layer norms in every block (meant to use with normalize=True)"
+        },
+    )
+    encoder_layers: int = field(
+        default=12, metadata={"help": "num encoder layers in the transformer"}
+    )
+    encoder_embed_dim: int = field(
+        default=768, metadata={"help": "encoder embedding dimension"}
+    )
+    encoder_ffn_embed_dim: int = field(
+        default=3072, metadata={"help": "encoder embedding dimension for FFN"}
+    )
+    encoder_attention_heads: int = field(
+        default=12, metadata={"help": "num encoder attention heads"}
+    )
+    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
+        default="gelu", metadata={"help": "activation function to use"}
+    )
+    layer_type: LAYER_TYPE_CHOICES = field(
+        default="transformer", metadata={"help": "layer type in encoder"}
+    )
+
+    # dropouts
+    dropout: float = field(
+        default=0.1,
+        metadata={"help": "dropout probability for the transformer"},
+    )
+    attention_dropout: float = field(
+        default=0.1,
+        metadata={"help": "dropout probability for attention weights"},
+    )
+    activation_dropout: float = field(
+        default=0.0,
+        metadata={"help": "dropout probability after activation in FFN"},
+    )
+    encoder_layerdrop: float = field(
+        default=0.0,
+        metadata={"help": "probability of dropping a transformer layer"},
+    )
+    dropout_input: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the input (after feat extr)"},
+    )
+    dropout_features: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the features (after feat extr)"},
+    )
+    # output projection, conv feature extractor and logit options
+    final_dim: int = field(
+        default=0,
+        metadata={
+            "help": "project final representations and targets to this many "
+            "dimensions. set to encoder_embed_dim if <= 0"
+        },
+    )
+    untie_final_proj: bool = field(
+        default=False,
+        metadata={"help": "use separate projection for each target"},
+    )
+    layer_norm_first: bool = field(
+        default=False,
+        metadata={"help": "apply layernorm first in the transformer"},
+    )
+    conv_feature_layers: str = field(
+        default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
+        metadata={
+            "help": "string describing convolutional feature extraction "
+            "layers in form of a python list that contains "
+            "[(dim, kernel_size, stride), ...]"
+        },
+    )
+    conv_bias: bool = field(
+        default=False, metadata={"help": "include bias in conv encoder"}
+    )
+    logit_temp: float = field(
+        default=0.1, metadata={"help": "temperature to divide logits by"}
+    )
+    target_glu: bool = field(
+        default=False, metadata={"help": "adds projection + glu to targets"}
+    )
+    feature_grad_mult: float = field(
+        default=1.0,
+        metadata={"help": "multiply feature extractor var grads by this"},
+    )
+
+    # masking
+    mask_length: int = field(default=10, metadata={"help": "mask length"})
+    mask_prob: float = field(
+        default=0.65,
+        metadata={"help": "probability of replacing a token with mask"},
+    )
+    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose mask length"}
+    )
+    mask_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument "
+            "(used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_overlap: bool = field(  # passed to compute_mask_indices as no_overlap
+        default=False, metadata={"help": "whether to prevent masks from overlapping"}
+    )
+    mask_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+
+    # channel masking
+    mask_channel_length: int = field(
+        default=10,
+        metadata={"help": "length of the mask for features (channels)"},
+    )
+    mask_channel_prob: float = field(
+        default=0.0,
+        metadata={"help": "probability of replacing a feature with 0"},
+    )
+    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static",
+        metadata={"help": "how to choose mask length for channel masking"},
+    )
+    mask_channel_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument "
+            "(used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_channel_overlap: bool = field(  # passed to compute_mask_indices as no_overlap
+        default=False,
+        metadata={"help": "whether to prevent channel masks from overlapping"},
+    )
+    mask_channel_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+
+    # positional embeddings
+    conv_pos: int = field(
+        default=128,
+        metadata={"help": "number of filters for convolutional positional embeddings"},
+    )
+    conv_pos_groups: int = field(
+        default=16,
+        metadata={"help": "number of groups for convolutional positional embedding"},
+    )
+
+    latent_temp: Tuple[float, float, float] = field(
+        default=(2, 0.5, 0.999995),
+        metadata={"help": "legacy (to be removed)"},
+    )
+
+    # loss computation
+    skip_masked: bool = field(
+        default=False,
+        metadata={"help": "skip computing losses over masked frames"},
+    )
+    skip_nomask: bool = field(
+        default=False,
+        metadata={"help": "skip computing losses over unmasked frames"},
+    )
+
+    checkpoint_activations: bool = field(
+        default=False,
+        metadata={"help": "recompute activations and save memory for extra compute"},
+    )
+
+    # FP16 optimization
+    required_seq_len_multiple: int = field(
+        default=2,
+        metadata={
+            "help": "pad the input to encoder such that the sequence length is divisible by multiple"
+        },
+    )
+
+    # Conformer
+    depthwise_conv_kernel_size: int = field(
+        default=31,
+        metadata={
+            "help": "depthwise-conv-kernel-size for convolution in conformer layer"
+        },
+    )
+    attn_type: str = field(
+        default="",
+        metadata={"help": "if espnet use ESPNET MHA"},
+    )
+    pos_enc_type: str = field(
+        default="abs",
+        metadata={"help": "Positional encoding type to use in conformer"},
+    )
+    fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"})
+
+
+@register_model("hubert", dataclass=HubertConfig)
+class HubertModel(BaseFairseqModel):
+    def __init__(
+        self,
+        cfg: HubertConfig,
+        task_cfg: HubertPretrainingConfig,
+        dictionaries: List[Dictionary],
+    ) -> None:
+        super().__init__()
+        logger.info(f"HubertModel Config: {cfg}")
+        # NOTE(review): eval() executes the config string as Python; only safe with trusted configs
+        feature_enc_layers = eval(cfg.conv_feature_layers)  # noqa
+        self.embed = feature_enc_layers[-1][0]  # dim of the last conv layer
+
+        self.feature_extractor = ConvFeatureExtractionModel(
+            conv_layers=feature_enc_layers,
+            dropout=0.0,
+            mode=cfg.extractor_mode,
+            conv_bias=cfg.conv_bias,
+        )
+        feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers])  # product of all conv strides
+        self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate  # labels per feature frame — assumes per-second rates; TODO confirm
+
+        self.post_extract_proj = (
+            nn.Linear(self.embed, cfg.encoder_embed_dim)
+            if self.embed != cfg.encoder_embed_dim
+            else None
+        )
+
+        self.mask_prob = cfg.mask_prob
+        self.mask_selection = cfg.mask_selection
+        self.mask_other = cfg.mask_other
+        self.mask_length = cfg.mask_length
+        self.no_mask_overlap = cfg.no_mask_overlap
+        self.mask_min_space = cfg.mask_min_space
+
+        self.mask_channel_prob = cfg.mask_channel_prob
+        self.mask_channel_selection = cfg.mask_channel_selection
+        self.mask_channel_other = cfg.mask_channel_other
+        self.mask_channel_length = cfg.mask_channel_length
+        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
+        self.mask_channel_min_space = cfg.mask_channel_min_space
+
+        self.dropout_input = nn.Dropout(cfg.dropout_input)
+        self.dropout_features = nn.Dropout(cfg.dropout_features)
+
+        self.feature_grad_mult = cfg.feature_grad_mult
+        self.logit_temp = cfg.logit_temp
+        self.skip_masked = cfg.skip_masked
+        self.skip_nomask = cfg.skip_nomask
+
+        final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
+        # learned vector that apply_mask() writes over the masked frames
+        self.mask_emb = nn.Parameter(
+            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
+        )
+
+        self.encoder = TransformerEncoder(cfg)
+        self.layer_norm = LayerNorm(self.embed)
+
+        self.target_glu = None
+        if cfg.target_glu:
+            self.target_glu = nn.Sequential(
+                nn.Linear(final_dim, final_dim * 2), nn.GLU()
+            )
+
+        self.untie_final_proj = cfg.untie_final_proj
+        if self.untie_final_proj:
+            self.final_proj = nn.Linear(
+                cfg.encoder_embed_dim, final_dim * len(dictionaries)
+            )
+        else:
+            self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)
+        # num_classes / label_embs_concat are only created when every dictionary is present
+        # modules below are not needed during fine-tuning
+        if any(d is None for d in dictionaries):
+            logger.info("cannot find dictionary. assume will be used for fine-tuning")
+        else:
+            self.num_classes = [len(d) for d in dictionaries]
+            self.label_embs_concat = nn.Parameter(
+                torch.FloatTensor(sum(self.num_classes), final_dim)
+            )
+            nn.init.uniform_(self.label_embs_concat)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        # no changes are made in this class: delegate to the parent and hand the dict back
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, cfg: HubertConfig, task: HubertPretrainingTask):
+        """Build a new model instance."""
+        # NOTE(review): instantiates HubertModel directly rather than cls, so a subclass also gets a HubertModel
+        model = HubertModel(cfg, task.cfg, task.dictionaries)
+        return model
+
+    def apply_mask(self, x, padding_mask, target_list):  # NOTE(review): target_list is not used in this method
+        B, T, C = x.shape
+        if self.mask_prob > 0:
+            mask_indices = compute_mask_indices(
+                (B, T),
+                padding_mask,
+                self.mask_prob,
+                self.mask_length,
+                self.mask_selection,
+                self.mask_other,
+                min_masks=2,
+                no_overlap=self.no_mask_overlap,
+                min_space=self.mask_min_space,
+            )
+            mask_indices = torch.from_numpy(mask_indices).to(x.device)
+            x[mask_indices] = self.mask_emb  # in place: overwrites the selected (batch, time) positions of the caller's tensor
+        else:
+            mask_indices = None
+        # channel masking: one (B, C) mask, expanded across all T frames, zeroes whole channels in place
+        if self.mask_channel_prob > 0:
+            mask_channel_indices = compute_mask_indices(
+                (B, C),
+                None,
+                self.mask_channel_prob,
+                self.mask_channel_length,
+                self.mask_channel_selection,
+                self.mask_channel_other,
+                no_overlap=self.no_mask_channel_overlap,
+                min_space=self.mask_channel_min_space,
+            )
+            mask_channel_indices = (
+                torch.from_numpy(mask_channel_indices)
+                .to(x.device)
+                .unsqueeze(1)
+                .expand(-1, T, -1)
+            )
+            x[mask_channel_indices] = 0
+        # mask_indices is None when time masking is disabled (mask_prob <= 0)
+        return x, mask_indices
+
+    def compute_nce(self, x, pos, negs):
+        neg_is_pos = (pos == negs).all(-1)  # marks negatives whose vector equals the positive one
+        pos = pos.unsqueeze(0)
+        targets = torch.cat([pos, negs], dim=0)  # index 0 along dim 0 is the positive target
+        # cosine similarity is computed on .float() copies, cast back to x's dtype, then divided by logit_temp
+        logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x)
+        logits /= self.logit_temp
+        if neg_is_pos.any():
+            logits[1:][neg_is_pos] = float("-inf")  # rule out negatives that duplicate the positive
+        logits = logits.transpose(0, 1)  # (num_x, num_cls+1)
+        return logits
+
+    def forward_features(self, source: torch.Tensor) -> torch.Tensor:
+        if self.feature_grad_mult > 0:
+            features = self.feature_extractor(source)
+            if self.feature_grad_mult != 1.0:  # a multiplier of exactly 1.0 skips the GradMultiply call
+                features = GradMultiply.apply(features, self.feature_grad_mult)
+        else:  # feature_grad_mult <= 0: run the extractor under no_grad, so it receives no gradients
+            with torch.no_grad():
+                features = self.feature_extractor(source)
+        return features
+
+    def forward_targets(
+        self,
+        features: torch.Tensor,
+        target_list: List[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # NOTE(review): the second element actually returned is a list of tensors
+        # Trim features to ensure labels exist and then get aligned labels
+        feat_tsz = features.size(2)  # frame count is read from dim 2 (features are transposed later, in forward())
+        targ_tsz = min([t.size(1) for t in target_list])  # shortest label sequence among all target sets
+        if self.feat2tar_ratio * feat_tsz > targ_tsz:
+            feat_tsz = int(targ_tsz / self.feat2tar_ratio)
+            features = features[..., :feat_tsz]
+        target_inds = torch.arange(feat_tsz).float() * self.feat2tar_ratio  # fractional label position for each frame
+        target_list = [t[:, target_inds.long()] for t in target_list]  # .long() truncates to an integer label index
+        return features, target_list
+
+    def forward_padding_mask(
+        self,
+        features: torch.Tensor,
+        padding_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        extra = padding_mask.size(1) % features.size(1)  # trailing entries that do not divide evenly among the frames
+        if extra > 0:
+            padding_mask = padding_mask[:, :-extra]
+        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)  # group mask entries per frame
+        padding_mask = padding_mask.all(-1)  # a frame is padding only if every entry in its group is padding
+        return padding_mask
+
+    def forward(
+        self,
+        source: torch.Tensor,
+        target_list: Optional[List[torch.Tensor]] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        mask: bool = True,
+        features_only: bool = False,
+        output_layer: Optional[int] = None,
+    ) -> Dict[str, torch.Tensor]:
+        """output layer is 1-based"""
+        features = self.forward_features(source)
+        if target_list is not None:
+            features, target_list = self.forward_targets(features, target_list)
+
+        features_pen = features.float().pow(2).mean()  # mean squared conv feature, returned as "features_pen"
+
+        features = features.transpose(1, 2)
+        features = self.layer_norm(features)
+        unmasked_features = features.clone()
+
+        if padding_mask is not None:
+            padding_mask = self.forward_padding_mask(features, padding_mask)
+
+        if self.post_extract_proj is not None:
+            features = self.post_extract_proj(features)
+
+        features = self.dropout_input(features)
+        unmasked_features = self.dropout_features(unmasked_features)  # NOTE(review): not read again in this method
+
+        if mask:
+            x, mask_indices = self.apply_mask(features, padding_mask, target_list)
+        else:
+            x = features
+            mask_indices = None
+
+        # feature: (B, T, D), float
+        # target: (B, T), long
+        # x: (B, T, D), float
+        # padding_mask: (B, T), bool
+        # mask_indices: (B, T), bool
+        x, _ = self.encoder(
+            x,
+            padding_mask=padding_mask,
+            layer=None if output_layer is None else output_layer - 1,
+        )
+
+        if features_only:
+            return {"x": x, "padding_mask": padding_mask, "features": features}
+        # NOTE(review): the loss path below needs tensors for padding_mask / mask_indices and a list for target_list; the None defaults (or mask=False) raise here
+        def compute_pred(proj_x, target, label_embs):
+            # compute logits for the i-th label set
+            y = torch.index_select(label_embs, 0, target.long())
+            negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1)
+            if self.target_glu:
+                y = self.target_glu(y)
+                negs = self.target_glu(negs)
+            # proj_x: (S, D)
+            # y: (S, D)
+            # negs: (Neg, S, D)
+            return self.compute_nce(proj_x, y, negs)
+        # label_embs_concat only exists when the model was built with all dictionaries present
+        label_embs_list = self.label_embs_concat.split(self.num_classes, 0)  # one chunk of rows per dictionary
+
+        if not self.skip_masked:
+            masked_indices = torch.logical_and(~padding_mask, mask_indices)
+            proj_x_m = self.final_proj(x[masked_indices])
+            if self.untie_final_proj:
+                proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1)
+            else:
+                proj_x_m_list = [proj_x_m for _ in range(len(target_list))]
+            logit_m_list = [
+                compute_pred(proj_x_m, t[masked_indices], label_embs_list[i])
+                for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list))
+            ]
+        else:
+            logit_m_list = [None for _ in target_list]
+
+        if not self.skip_nomask:
+            nomask_indices = torch.logical_and(~padding_mask, ~mask_indices)
+            proj_x_u = self.final_proj(x[nomask_indices])
+            if self.untie_final_proj:
+                proj_x_u_list = proj_x_u.chunk(len(target_list), dim=-1)
+            else:
+                proj_x_u_list = [proj_x_u for _ in range(len(target_list))]
+
+            logit_u_list = [
+                compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i])
+                for i, (proj_x_u, t) in enumerate(zip(proj_x_u_list, target_list))
+            ]
+        else:
+            logit_u_list = [None for _ in target_list]
+
+        result = {
+            "logit_m_list": logit_m_list,
+            "logit_u_list": logit_u_list,
+            "padding_mask": padding_mask,
+            "features_pen": features_pen,
+        }
+        return result
+
+    def extract_features(
+        self,
+        source: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        mask: bool = False,
+        ret_conv: bool = False,
+        output_layer: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # the returned padding mask is None when no padding_mask was passed in
+        res = self.forward(
+            source,
+            padding_mask=padding_mask,
+            mask=mask,
+            features_only=True,
+            output_layer=output_layer,
+        )
+        feature = res["features"] if ret_conv else res["x"]  # ret_conv picks the encoder input instead of its output
+        return feature, res["padding_mask"]
+
+    def get_logits(self, net_output, is_masked=True):  # picks the masked or unmasked logits from forward()'s result dict
+        if is_masked:
+            logits_list = net_output["logit_m_list"]
+        else:
+            logits_list = net_output["logit_u_list"]
+        logits_list = [x.float() for x in logits_list if x is not None]  # None entries (skipped losses) are dropped
+        return logits_list
+
+    def get_targets(self, net_output, is_masked=True):
+        logits_list = self.get_logits(net_output, is_masked)
+        targets_list = [x.new_zeros(x.size(0), dtype=torch.long) for x in logits_list]  # all zeros: compute_nce puts the positive in column 0
+        return targets_list
+
+    def get_extra_losses(self, net_output):  # -> (list of loss values, list of their names), in matching order
+        extra_losses = []
+        names = []
+
+        if "features_pen" in net_output:  # absent when forward() was run with features_only=True
+            extra_losses.append(net_output["features_pen"])
+            names.append("features_pen")
+
+        return extra_losses, names
+
+    def remove_pretraining_modules(self):  # forward() still works afterwards only with features_only=True
+        self.target_glu = None
+        self.final_proj = None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
new file mode 100644
index 0000000000..11c85ce7d1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
@@ -0,0 +1,675 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import copy
+import logging
+import math
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Any, Optional
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from omegaconf import II, MISSING, open_dict
+
+from fairseq import checkpoint_utils, tasks, utils
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+)
+from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES
+from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer
+from fairseq.tasks import FairseqTask
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class HubertAsrConfig(FairseqDataclass):
+    """Fine-tuning options shared by the HuBERT ASR model configs in this file."""
+    w2v_path: str = field(default=MISSING, metadata={"help": "path to hubert model"})
+    no_pretrained_weights: bool = field(
+        default=False,
+        metadata={"help": "if true, does not load pretrained weights"},
+    )
+    dropout_input: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the input (after feat extr)"},
+    )
+    final_dropout: float = field(
+        default=0.0,
+        metadata={"help": "dropout after transformer and before final projection"},
+    )
+    dropout: float = field(
+        default=0.0,
+        metadata={"help": "dropout probability inside hubert model"},
+    )
+    attention_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability for attention weights " "inside hubert model"
+        },
+    )
+    activation_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability after activation in FFN " "inside hubert model"
+        },
+    )
+    encoder_embed_dim: Optional[int] = field(
+        default=768, metadata={"help": "encoder embedding dimension"}
+    )
+
+    # masking: apply_mask is read by HubertEncoder; the rest override the pretrained args
+    apply_mask: bool = field(
+        default=False, metadata={"help": "apply masking during fine-tuning"}
+    )
+    mask_length: int = field(
+        default=10, metadata={"help": "repeat the mask indices multiple times"}
+    )
+    mask_prob: float = field(
+        default=0.5,
+        metadata={
+            "help": "probability of replacing a token with mask "
+            "(normalized by length)"
+        },
+    )
+    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose masks"}
+    )
+    mask_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument "
+            "(used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_overlap: bool = field(
+        default=False, metadata={"help": "whether to allow masks to overlap"}
+    )
+
+    # channel masking and encoder options, also passed on as pretrained-arg overrides
+    mask_channel_length: int = field(
+        default=10,
+        metadata={"help": "length of the mask for features (channels)"},
+    )
+    mask_channel_prob: float = field(
+        default=0.0,
+        metadata={"help": "probability of replacing a feature with 0"},
+    )
+    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static",
+        metadata={"help": "how to choose mask length for channel masking"},
+    )
+    mask_channel_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument "
+            "(used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_channel_overlap: bool = field(
+        default=False,
+        metadata={"help": "whether to allow channel masks to overlap"},
+    )
+    freeze_finetune_updates: int = field(
+        default=0,
+        metadata={"help": "dont finetune hubert for this many updates"},
+    )
+    feature_grad_mult: float = field(
+        default=0.0,
+        metadata={"help": "reset feature grad mult in hubert to this"},
+    )
+    layerdrop: float = field(
+        default=0.0,
+        metadata={"help": "probability of dropping a layer in hubert"},
+    )
+    normalize: bool = II("task.normalize")  # interpolated from the task config
+    data: str = II("task.data")  # interpolated from the task config
+    # this holds the loaded hubert args (HubertEncoder fills it in when left as None)
+    w2v_args: Any = None
+
+
+@dataclass
+class HubertCtcConfig(HubertAsrConfig):
+    """Configuration for the ``hubert_ctc`` model; adds no fields of its own."""
+
+
+@register_model("hubert_ctc", dataclass=HubertCtcConfig)
+class HubertCtc(BaseFairseqModel):
+    """CTC model that returns the output of a wrapped ``HubertEncoder``."""
+
+    def __init__(self, cfg: HubertCtcConfig, w2v_encoder: BaseFairseqModel):
+        super().__init__()
+        self.cfg = cfg
+        self.w2v_encoder = w2v_encoder
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, cfg: HubertCtcConfig, task: FairseqTask):
+        """Build a new model instance."""
+        w2v_encoder = HubertEncoder(cfg, task)
+        return cls(cfg, w2v_encoder)
+
+    def get_normalized_probs(self, net_output, log_probs):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        logits = net_output["encoder_out"].float()
+        normalize = utils.log_softmax if log_probs else utils.softmax
+        return normalize(logits, dim=-1)
+
+    def get_logits(self, net_output):
+        """Return the encoder logits with padded frames forced to class 0 (in place)."""
+        logits = net_output["encoder_out"]
+        padding = net_output["encoder_padding_mask"]
+        if padding is not None and padding.any():
+            # ``logits[mask][..., 0] = 0`` only writes to a temporary copy, so build
+            # the per-frame fill row once and assign it through the mask instead.
+            fill = logits.new_full((logits.size(-1),), float("-inf"))
+            fill[0] = 0
+            logits[padding.T] = fill
+        return logits
+
+    def forward(self, **kwargs):
+        return self.w2v_encoder(**kwargs)
+
+
+@dataclass
+class HubertSeq2SeqConfig(HubertAsrConfig):
+    """Options for ``hubert_seq2seq``: the shared ASR fields plus decoder settings."""
+
+    decoder_embed_dim: int = field(
+        default=768, metadata={"help": "decoder embedding dimension"}
+    )
+    decoder_ffn_embed_dim: int = field(
+        default=3072, metadata={"help": "decoder embedding dimension for FFN"}
+    )
+    decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"})
+    decoder_layerdrop: float = field(
+        default=0.0, metadata={"help": "decoder layerdrop chance"}
+    )
+    decoder_attention_heads: int = field(
+        default=4, metadata={"help": "num decoder attention heads"}
+    )
+    decoder_learned_pos: bool = field(
+        default=False,
+        metadata={"help": "use learned positional embeddings in the decoder"},
+    )
+    decoder_normalize_before: bool = field(
+        default=False, metadata={"help": "apply layernorm before each decoder block"}
+    )
+    no_token_positional_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, disables positional embeddings (outside self attention)"
+        },
+    )
+    decoder_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability in the decoder"}
+    )
+    decoder_attention_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability for attention weights inside the decoder"
+        },
+    )
+    decoder_activation_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability after activation in FFN inside the decoder"
+        },
+    )
+    max_target_positions: int = field(
+        default=2048, metadata={"help": "max target positions"}
+    )
+    share_decoder_input_output_embed: bool = field(
+        default=False, metadata={"help": "share decoder input and output embeddings"}
+    )
+    autoregressive: bool = II("task.autoregressive")
+    seq2seq_path: str = field(
+        default="", metadata={"help": "seq2seq checkpoint to initialize the model from"}
+    )
+    reset_dict: bool = field(
+        default=False, metadata={"help": "reset_dict"}  # drop decoder embeddings on load
+    )
+
+
+@register_model("hubert_seq2seq", dataclass=HubertSeq2SeqConfig)
+class HubertSeq2SeqModel(FairseqEncoderDecoderModel):
+    """Pair a ``HubertEncoder`` with the ``TransformerDecoder`` from this file."""
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def build_model(cls, cfg: HubertSeq2SeqConfig, task: FairseqTask):
+        """Build a new model instance, optionally loading ``cfg.seq2seq_path``."""
+        assert (
+            cfg.autoregressive
+        ), "Please set task.autoregressive=true for seq2seq asr models"
+
+        tgt_dict = task.target_dictionary
+        decoder_embed_tokens = Embedding(
+            len(tgt_dict), cfg.decoder_embed_dim, tgt_dict.pad()
+        )
+
+        encoder = cls.build_encoder(cfg, task)
+        decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens)
+        model = HubertSeq2SeqModel(encoder, decoder)
+
+        if cfg.seq2seq_path:
+            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.seq2seq_path)["model"]
+            if cfg.reset_dict:
+                cls._drop_dict_weights(state)
+            # No model_cfg is passed here, so load_state_dict must accept None.
+            model.load_state_dict(state, strict=False)
+        return model
+
+    @classmethod
+    def build_encoder(cls, cfg: HubertAsrConfig, task):
+        return HubertEncoder(cfg, task)
+
+    @classmethod
+    def build_decoder(cls, cfg: HubertSeq2SeqConfig, tgt_dict, embed_tokens):
+        return TransformerDecoder(cfg, tgt_dict, embed_tokens)
+
+    def forward(self, **kwargs):
+        encoder_out = self.encoder(**kwargs)
+        decoder_out = self.decoder(encoder_out=encoder_out, **kwargs)
+        return decoder_out
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
+    @staticmethod
+    def _drop_dict_weights(state_dict):
+        """Remove the decoder embedding entries from *state_dict*, if present."""
+        for key in ("decoder.embed_out", "decoder.embed_tokens.weight"):
+            state_dict.pop(key, None)
+
+    def load_state_dict(
+        self,
+        state_dict,
+        strict=True,
+        model_cfg=None,
+        args: Optional[Namespace] = None,
+    ):
+        """Load weights; ``reset_dict`` drops decoder embeddings and disables strict."""
+        if getattr(model_cfg, "reset_dict", False):
+            logger.warning("Overriding loading strict state dict!")
+            self._drop_dict_weights(state_dict)
+            strict = False
+        return super().load_state_dict(state_dict, strict, model_cfg, args)
+
+
+class HubertEncoder(FairseqEncoder):
+    """Encoder wrapping a pretrained HuBERT model plus an optional output projection."""
+
+    def __init__(self, cfg: HubertAsrConfig, task):
+        self.apply_mask = cfg.apply_mask
+        arg_overrides = {
+            "dropout": cfg.dropout,
+            "activation_dropout": cfg.activation_dropout,
+            "dropout_input": cfg.dropout_input,
+            "attention_dropout": cfg.attention_dropout,
+            "mask_length": cfg.mask_length,
+            "mask_prob": cfg.mask_prob,
+            "mask_selection": cfg.mask_selection,
+            "mask_other": cfg.mask_other,
+            "no_mask_overlap": cfg.no_mask_overlap,
+            "mask_channel_length": cfg.mask_channel_length,
+            "mask_channel_prob": cfg.mask_channel_prob,
+            "mask_channel_selection": cfg.mask_channel_selection,
+            "mask_channel_other": cfg.mask_channel_other,
+            "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
+            "encoder_layerdrop": cfg.layerdrop,
+            "feature_grad_mult": cfg.feature_grad_mult,
+        }
+
+        if cfg.w2v_args is None:
+            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides)
+            w2v_args = state.get("cfg", None)
+            if w2v_args is None:
+                w2v_args = convert_namespace_to_omegaconf(state["args"])
+            cfg.w2v_args = w2v_args
+        else:
+            state = None
+            w2v_args = cfg.w2v_args
+            if isinstance(w2v_args, Namespace):
+                cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args)
+
+        assert cfg.normalize == w2v_args.task.normalize, (
+            "Fine-tuning works best when data normalization is the same. "
+            "Please check that --normalize is set or unset for "
+            "both pre-training and here"
+        )
+
+        w2v_args.task.data = cfg.data
+        pretrain_task = tasks.setup_task(w2v_args.task)
+        if state is not None and "task_state" in state:
+            # This will load the stored "dictionaries" object
+            pretrain_task.load_state_dict(state["task_state"])
+        else:
+            pretrain_task.load_state_dict(task.state_dict())
+
+        model = pretrain_task.build_model(w2v_args.model, from_checkpoint=True)
+        if state is not None and not cfg.no_pretrained_weights:
+            # set strict=False because we omit some modules
+            model.load_state_dict(state["model"], strict=False)
+
+        model.remove_pretraining_modules()
+
+        super().__init__(pretrain_task.source_dictionary)
+        d = w2v_args.model.encoder_embed_dim
+        self.w2v_model = model
+
+        self.final_dropout = nn.Dropout(cfg.final_dropout)
+        self.freeze_finetune_updates = cfg.freeze_finetune_updates
+        self.num_updates = 0
+
+        # ``autoregressive`` is only declared on the seq2seq config; read it defensively.
+        autoregressive = getattr(cfg, "autoregressive", False)
+        if task.target_dictionary is not None and not autoregressive:
+            self.proj = Linear(d, len(task.target_dictionary))
+        elif getattr(cfg, "decoder_embed_dim", d) != d:
+            self.proj = Linear(d, cfg.decoder_embed_dim)
+        else:
+            self.proj = None
+
+    def set_num_updates(self, num_updates):
+        """Set the number of parameters updates."""
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+    def forward(self, source, padding_mask, tbc=True, **kwargs):
+        """Extract HuBERT features, then apply dropout and the optional projection."""
+        w2v_args = {
+            "source": source,
+            "padding_mask": padding_mask,
+            "mask": self.apply_mask and self.training,
+        }
+
+        ft = self.freeze_finetune_updates <= self.num_updates
+        with torch.no_grad() if not ft else contextlib.ExitStack():
+            x, padding_mask = self.w2v_model.extract_features(**w2v_args)
+
+            if tbc:
+                # B x T x C -> T x B x C
+                x = x.transpose(0, 1)
+
+        x = self.final_dropout(x)
+
+        if self.proj is not None:
+            x = self.proj(x)
+
+        return {
+            "encoder_out": x,  # T x B x C
+            "encoder_padding_mask": padding_mask,  # B x T
+            "padding_mask": padding_mask,
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        if encoder_out["encoder_out"] is not None:
+            encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
+                1, new_order
+            )
+        if encoder_out["encoder_padding_mask"] is not None:
+            encoder_out["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ].index_select(0, new_order)
+        if encoder_out["padding_mask"] is not None:
+            encoder_out["padding_mask"] = encoder_out["padding_mask"].index_select(
+                0, new_order
+            )
+        return encoder_out
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return None
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
+
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *cfg.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        cfg (HubertSeq2SeqConfig): decoder hyper-parameters
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): token embedding; doubles as the output
+            projection when ``share_decoder_input_output_embed`` is set
+        no_encoder_attn (bool, optional): passed to each decoder layer (default: False).
+    """
+
+    def __init__(
+        self,
+        cfg: HubertSeq2SeqConfig,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+    ):
+        super().__init__(dictionary)
+
+        self.dropout = cfg.decoder_dropout
+        self.share_input_output_embed = cfg.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = cfg.decoder_embed_dim
+        self.output_embed_dim = cfg.decoder_embed_dim
+
+        self.layerdrop = cfg.decoder_layerdrop
+
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = cfg.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        self.project_in_dim = (
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                cfg.max_target_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=cfg.decoder_learned_pos,
+            )
+            if not cfg.no_token_positional_embeddings
+            else None
+        )
+
+        # TODO: update this when transformer gets converted to dataclass configs
+        transformer_cfg = copy.deepcopy(cfg)
+        with open_dict(transformer_cfg):
+            transformer_cfg.dropout = transformer_cfg.decoder_dropout
+            transformer_cfg.attention_dropout = (
+                transformer_cfg.decoder_attention_dropout
+            )
+            transformer_cfg.activation_dropout = (
+                transformer_cfg.decoder_activation_dropout
+            )
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                TransformerDecoderLayer(transformer_cfg, no_encoder_attn)
+                for _ in range(transformer_cfg.decoder_layers)
+            ]
+        )
+
+        if not self.share_input_output_embed:
+            self.embed_out = nn.Parameter(
+                torch.Tensor(len(dictionary), self.output_embed_dim)
+            )
+            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5)
+
+        if transformer_cfg.decoder_normalize_before:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (Tensor, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        if isinstance(prev_output_tokens, list):
+            # A list of 1-D token tensors is right-padded with zeros into one batch.
+            max_len = max(len(x) for x in prev_output_tokens)
+            tmp = torch.zeros(
+                [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device
+            )
+            for i, p in enumerate(prev_output_tokens):
+                tmp[i, : len(p)] = p
+            prev_output_tokens = tmp
+        prev_output_tokens = prev_output_tokens.long()
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out, incremental_state
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+        inner_states = [x]
+
+        # decoder layers
+        self_attn_padding_mask = None
+        if prev_output_tokens.eq(self.padding_idx).any():
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        for layer in self.layers:
+            dropout_probability = np.random.random()
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, attn, _ = layer(
+                    x,
+                    encoder_out["encoder_out"] if encoder_out is not None else None,
+                    encoder_out["padding_mask"] if encoder_out is not None else None,
+                    incremental_state,
+                    self_attn_mask=self.buffered_future_mask(x)
+                    if incremental_state is None
+                    else None,
+                    self_attn_padding_mask=self_attn_padding_mask,
+                )
+                inner_states.append(x)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size."""
+        # project back to size of vocabulary
+        if self.share_input_output_embed:
+            return F.linear(features, self.embed_tokens.weight)
+        else:
+            return F.linear(features, self.embed_out)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):
+        """Return a cached square mask for ``tensor.size(0)`` steps (upper triangle kept)."""
+        dim = tensor.size(0)
+        if (
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(embedding.weight, mean=0, std=embedding_dim**-0.5)
+    nn.init.constant_(embedding.weight[padding_idx], 0)  # re-zero the padding row
+    return embedding
+
+
+def Linear(in_features, out_features, bias=True):
+    layer = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(layer.weight)  # Xavier-uniform weights
+    if bias:
+        nn.init.constant_(layer.bias, 0.0)  # bias starts at zero
+    return layer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
new file mode 100644
index 0000000000..f7911c2c8e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import os
+
+
+# Import every module (``*.py``) or sub-directory in this package directory,
+# skipping names that start with "_" or ".".
+models_dir = os.path.dirname(__file__)
+for file in os.listdir(models_dir):
+    path = os.path.join(models_dir, file)
+    if file.startswith("_") or file.startswith("."):
+        continue  # private or hidden entry
+    if not (file.endswith(".py") or os.path.isdir(path)):
+        continue  # neither a module nor a directory
+    model_name = file[: file.find(".py")] if file.endswith(".py") else file
+    module = importlib.import_module("fairseq.models.huggingface." + model_name)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
new file mode 100644
index 0000000000..3a8eb78198
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+from typing import Dict, List, Optional
+
+import torch
+from fairseq.models import (
+    FairseqIncrementalDecoder,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model("hf_gpt2")
+class HuggingFaceGPT2LanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the model-specific size and dropout options on *parser*."""
+        options = (
+            ("--embed-dim", int, "N", "embedding dimension"),
+            ("--num-attention-heads", int, "N", "num attention heads"),
+            ("--num-layers", int, "N", "num layers"),
+            (
+                "--dropout", float, "D",
+                "dropout probability for all fully connected layers "
+                "in the embeddings, encoder, and pooler",
+            ),
+            ("--attention-dropout", float, "D", "dropout probability for attention weights"),
+        )
+        for flag, value_type, metavar, help_text in options:
+            parser.add_argument(flag, type=value_type, metavar=metavar, help=help_text)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Apply the default architecture to *args*, then wrap a new GPT-2 decoder."""
+        default_architecture(args)
+        return cls(HuggingFaceGPT2Decoder(args, task))
+
+
+class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder):
+    """Incremental decoder backed by a ``transformers`` ``GPT2LMHeadModel``."""
+
+    def __init__(self, args, task):
+        try:
+            from transformers import GPT2Config, GPT2LMHeadModel
+        except ImportError as err:
+            raise ImportError(
+                "\n\nPlease install huggingface/transformers with:"
+                "\n\n  pip install transformers"
+            ) from err
+
+        super().__init__(task.target_dictionary)
+        config = GPT2Config(
+            vocab_size=len(task.target_dictionary),
+            n_positions=args.max_target_positions + 1,
+            n_ctx=args.max_target_positions,
+            n_embd=args.embed_dim,
+            n_layer=args.num_layers,
+            n_head=args.num_attention_heads,
+            resid_pdrop=args.dropout,
+            embd_pdrop=args.dropout,
+            attn_pdrop=args.attention_dropout,
+            layer_norm_epsilon=1e-6,
+        )
+        self.model = GPT2LMHeadModel(config)
+
+        # set zero embedding for padding symbol
+        self.pad_idx = task.target_dictionary.pad()
+        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
+        self.model.transformer.wpe.weight.data[0].zero_()
+
+    def forward(
+        self,
+        prev_output_tokens,
+        src_lengths=None,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+        encoder_out=None,
+    ):
+        features = self.extract_features(prev_output_tokens, incremental_state)
+        lm_logits = self.model.lm_head(features)
+        return (lm_logits,)
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+    ):
+        """Return the transformer's last hidden states for *prev_output_tokens*."""
+        past = None
+        if incremental_state:
+            # get_incremental_state takes the state dict first, like the setter below.
+            past = self.get_incremental_state(incremental_state, "past")
+
+        # don't attend to padding symbols
+        attention_mask = prev_output_tokens.ne(self.pad_idx).int()
+
+        # set position ids to exclude padding symbols
+        position_ids = attention_mask * (
+            torch.arange(1, 1 + prev_output_tokens.size(1))
+            .to(prev_output_tokens)
+            .repeat(prev_output_tokens.size(0), 1)
+        )
+
+        outputs = self.model.transformer(
+            input_ids=prev_output_tokens,
+            past=past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+        last_hidden_states = outputs[0]
+        if incremental_state:
+            self.set_incremental_state(incremental_state, "past", outputs[1])
+        return last_hidden_states
+
+    def max_positions(self):
+        return self.model.config.n_positions - 1
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2")
+def default_architecture(args):
+    """Fill any hyper-parameter missing from *args* with the defaults below."""
+    if getattr(args, "max_target_positions", None) is None:
+        fallback = getattr(args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS)
+        args.max_target_positions = fallback
+    base = (("embed_dim", 768), ("num_attention_heads", 12), ("num_layers", 12))
+    base += (("dropout", 0.1), ("attention_dropout", 0.1))
+    # getattr keeps a value that is already present, even if it is None.
+    for name, value in base:
+        setattr(args, name, getattr(args, name, value))
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_medium")
+def hf_gpt2_medium(args):
+    sizes = (("embed_dim", 1024), ("num_attention_heads", 16), ("num_layers", 24))
+    for name, value in sizes:  # only fills attributes that are missing
+        setattr(args, name, getattr(args, name, value))
+    default_architecture(args)  # remaining fields get the shared defaults
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_large")
+def hf_gpt2_large(args):
+    sizes = (("embed_dim", 1280), ("num_attention_heads", 20), ("num_layers", 36))
+    for name, value in sizes:  # only fills attributes that are missing
+        setattr(args, name, getattr(args, name, value))
+    default_architecture(args)  # remaining fields get the shared defaults
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_xl")
+def hf_gpt2_xl(args):
+    sizes = (("embed_dim", 1600), ("num_attention_heads", 25), ("num_layers", 48))
+    for name, value in sizes:  # only fills attributes that are missing
+        setattr(args, name, getattr(args, name, value))
+    default_architecture(args)  # remaining fields get the shared defaults
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
new file mode 100644
index 0000000000..7950280e30
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
@@ -0,0 +1,1119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    DynamicConv_scripatable as DynamicConv,
+    FairseqDropout,
+    LayerNorm,
+    LightweightConv,
+    MultiheadAttention,
+    PositionalEmbedding,
+)
+from fairseq.utils import safe_hasattr
+from torch import Tensor
+
+
+@register_model("lightconv")
+class LightConvModel(FairseqEncoderDecoderModel):
+    """
+    LightConv and DynamicConv model from `"Pay Less Attention with Lightweight and Dynamic Convolutions" (Wu, et al, 2019)
+    <https://openreview.net/pdf?id=SkVhlh09tX>`_.
+    To use LightConv please set ``--encoder-conv-type lightweight --decoder-conv-type lightweight``
+    To use DynamicConv please set ``--encoder-conv-type dynamic --decoder-conv-type dynamic``
+
+    Args:
+        encoder (LightConvEncoder): the encoder
+        decoder (LightConvDecoder): the decoder
+
+    The LightConv model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.lightconv_parser
+        :prog:
+    """
+
+    @classmethod
+    def hub_models(cls):  # maps each pretrained-model name to its archive URL and tokenizer/BPE settings
+        # fmt: off
+
+        def moses_subword(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'subword_nmt',
+            }
+
+        return {  # NOTE(review): the wmt17.en-de keys point at the wmt16.en-de archives - confirm intended
+            'lightconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz'),
+            'dynamicconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz'),
+            'lightconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz'),
+            'dynamicconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz'),
+            'lightconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'),
+            'lightconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'),
+            'lightconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz'),
+            'lightconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz'),
+            'dynamicconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz'),
+        }
+        # fmt: on
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--relu-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after ReLU in FFN",
+        )
+        parser.add_argument(
+            "--input-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability of the inputs",
+        )
+        parser.add_argument(
+            "--encoder-embed-path",
+            type=str,
+            metavar="STR",
+            help="path to pre-trained encoder embedding",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-conv-dim",
+            type=int,
+            metavar="N",
+            help="encoder convolution dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads or LightConv/DynamicConv heads",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--encoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the encoder",
+        )
+        parser.add_argument(
+            "--decoder-embed-path",
+            type=str,
+            metavar="STR",
+            help="path to pre-trained decoder embedding",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-conv-dim",
+            type=int,
+            metavar="N",
+            help="decoder convolution dimension",
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads or LightConv/DynamicConv heads",
+        )
+        parser.add_argument(
+            "--decoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the decoder",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--share-all-embeddings",
+            action="store_true",
+            help="share encoder, decoder and output embeddings"
+            " (requires shared dictionary and embed dim)",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive softmax cutoff points. "
+            "Must be used with adaptive_loss criterion",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-dropout",
+            type=float,
+            metavar="D",
+            help="sets adaptive softmax dropout for the tail projections",
+        )
+
+        # LightConv and DynamicConv arguments
+        parser.add_argument(
+            "--encoder-kernel-size-list",
+            type=lambda x: utils.eval_str_list(x, int),
+            help='list of kernel size (default: "[3,7,15,31,31,31,31]")',
+        )
+        parser.add_argument(
+            "--decoder-kernel-size-list",
+            type=lambda x: utils.eval_str_list(x, int),
+            help='list of kernel size (default: "[3,7,15,31,31,31]")',
+        )
+        parser.add_argument(
+            "--encoder-glu", type=utils.eval_bool, help="glu after in proj"
+        )
+        parser.add_argument(
+            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
+        )
+        parser.add_argument(
+            "--encoder-conv-type",
+            default="dynamic",
+            type=str,
+            choices=["dynamic", "lightweight"],
+            help="type of convolution",
+        )
+        parser.add_argument(
+            "--decoder-conv-type",
+            default="dynamic",
+            type=str,
+            choices=["dynamic", "lightweight"],
+            help="type of convolution",
+        )
+        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
+        parser.add_argument(
+            "--weight-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for conv weights",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        if not safe_hasattr(args, "max_source_positions"):
+            args.max_source_positions = 1024
+        if not safe_hasattr(args, "max_target_positions"):
+            args.max_target_positions = 1024
+
+        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
+
+        def build_embedding(dictionary, embed_dim, path=None):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            emb = Embedding(num_embeddings, embed_dim, padding_idx)
+            # if provided, load from preloaded dictionaries
+            if path:
+                embed_dict = utils.parse_embedding(path)
+                utils.load_embedding(embed_dict, dictionary, emb)
+            return emb
+
+        if args.share_all_embeddings:  # one embedding table serves encoder, decoder input and output
+            if src_dict != tgt_dict:
+                raise RuntimeError(
+                    "--share-all-embeddings requires a joined dictionary"
+                )
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise RuntimeError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise RuntimeError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+            encoder_embed_tokens = build_embedding(
+                src_dict, args.encoder_embed_dim, args.encoder_embed_path
+            )
+            decoder_embed_tokens = encoder_embed_tokens
+            args.share_decoder_input_output_embed = True  # sharing everything implies tying the output layer
+        else:
+            encoder_embed_tokens = build_embedding(
+                src_dict, args.encoder_embed_dim, args.encoder_embed_path
+            )
+            decoder_embed_tokens = build_embedding(
+                tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
+            )
+
+        encoder = LightConvEncoder(args, src_dict, encoder_embed_tokens)
+        decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens)
+        return LightConvModel(encoder, decoder)
+
+    def forward(
+        self,
+        src_tokens: Tensor,
+        src_lengths: Tensor,
+        prev_output_tokens: Tensor,
+    ):
+        """
+        (The forward method inherited from the base class has a **kwargs
+        argument in its input, which is not supported in torchscript. This
+        method overwrites the forward method definition without **kwargs.)
+
+        Run the forward pass for an encoder-decoder model.
+
+        First feed a batch of source tokens through the encoder. Then, feed the
+        encoder output and previous decoder outputs (i.e., teacher forcing) to
+        the decoder to produce the next outputs::
+
+            encoder_out = self.encoder(src_tokens, src_lengths)
+            return self.decoder(prev_output_tokens, encoder_out)
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths)
+        decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out)
+        return decoder_out
+
+
+class LightConvEncoder(FairseqEncoder):
+    """
+    LightConv encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`LightConvEncoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_tokens (torch.nn.Embedding): input embedding
+    """
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+
+        embed_dim = embed_tokens.embedding_dim
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_source_positions = args.max_source_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # multiplies the token embeddings in forward()
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.encoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                LightConvEncoderLayer(
+                    args, kernel_size=args.encoder_kernel_size_list[i]  # layer i uses the i-th kernel size
+                )
+                for i in range(args.encoder_layers)
+            ]
+        )
+        self.register_buffer("version", torch.Tensor([2]))
+        self.normalize = args.encoder_normalize_before
+        if self.normalize:  # a final LayerNorm exists only in the normalize-before configuration
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self, src_tokens: Tensor, src_lengths: Optional[Tensor] = None
+    ) -> Dict[str, List[Tensor]]:
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor, optional): echoed back under the
+                ``src_lengths`` key when given; not otherwise used here
+
+        Returns:
+            dict of single-element lists: **encoder_out** holds the last
+            layer's output `(src_len, batch, embed_dim)`; **encoder_padding_mask**
+            `(batch, src_len)` is included only if some token is padding
+        """
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(src_tokens)
+        if self.embed_positions is not None:
+            x += self.embed_positions(src_tokens)
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # compute padding mask
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)  # B x T
+        if not encoder_padding_mask.any():  # no padding anywhere: pass None to the layers
+            encoder_mask = None
+        else:
+            encoder_mask = encoder_padding_mask
+
+        # encoder layers
+        for layer in self.layers:
+            x = layer(x, encoder_mask)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        output_dict: Dict[str, List[Tensor]] = {}
+        if src_lengths is not None:
+            output_dict["src_lengths"] = [src_lengths]
+        output_dict["encoder_out"] = [x]  # T x B x C
+        if encoder_mask is not None:
+            output_dict["encoder_padding_mask"] = [encoder_mask]  # B x T
+
+        return output_dict
+
+    @torch.jit.export
+    def reorder_encoder_out(
+        self, encoder_out: Dict[str, List[Tensor]], new_order: Tensor
+    ):
+        """
+        Reorder encoder output according to *new_order*.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            *encoder_out* rearranged according to *new_order*
+        """
+        if len(encoder_out["encoder_out"]) == 0:
+            encoder = []
+        else:
+            encoder = [encoder_out["encoder_out"][0].index_select(1, new_order)]  # batch is dim 1 (T x B x C)
+        output_dict = {"encoder_out": encoder}  # note: a "src_lengths" entry is not carried over
+
+        if ("encoder_padding_mask" not in encoder_out) or (
+            len(encoder_out["encoder_padding_mask"]) == 0
+        ):
+            encoder_padding_mask = []
+        else:
+            encoder_padding_mask = [
+                encoder_out["encoder_padding_mask"][0].index_select(0, new_order)  # batch is dim 0 (B x T)
+            ]
+        output_dict["encoder_padding_mask"] = encoder_padding_mask
+        return output_dict
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        if self.embed_positions is None:
+            return self.max_source_positions
+        return min(self.max_source_positions, self.embed_positions.max_positions)
+
+
+class LightConvDecoder(FairseqIncrementalDecoder):
+    """
+    LightConv decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`LightConvDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): build layers without encoder attention. Default: ``False``
+        final_norm (bool, optional): if ``False``, never add the final LayerNorm. Default: ``True``
+    """
+
+    def __init__(
+        self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True
+    ):
+        super().__init__(dictionary)
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        output_embed_dim = args.decoder_output_dim
+
+        padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = args.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        self.project_in_dim = (  # only needed when the embedding width differs from the model width
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_target_positions,
+                embed_dim,
+                padding_idx,
+                learned=args.decoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                LightConvDecoderLayer(
+                    args,
+                    no_encoder_attn,
+                    kernel_size=args.decoder_kernel_size_list[i],
+                    dictionary=dictionary,
+                )
+                for i in range(args.decoder_layers)
+            ]
+        )
+
+        self.adaptive_softmax = None
+        self.output_projection = None
+
+        self.project_out_dim = (  # only needed when the model width differs from the output width
+            Linear(embed_dim, output_embed_dim, bias=False)
+            if embed_dim != output_embed_dim and not args.tie_adaptive_weights
+            else None
+        )
+
+        if args.adaptive_softmax_cutoff is not None:  # adaptive softmax replaces the output projection
+            self.adaptive_softmax = AdaptiveSoftmax(
+                len(dictionary),
+                output_embed_dim,
+                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
+                dropout=args.adaptive_softmax_dropout,
+                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
+                factor=args.adaptive_softmax_factor,
+                tie_proj=args.tie_adaptive_proj,
+            )
+        elif self.share_input_output_embed:  # output projection reuses the input embedding matrix
+            self.output_projection = nn.Linear(
+                self.embed_tokens.weight.shape[1],
+                self.embed_tokens.weight.shape[0],
+                bias=False,
+            )
+            self.output_projection.weight = self.embed_tokens.weight
+
+        else:
+            self.output_projection = nn.Linear(
+                output_embed_dim, len(dictionary), bias=False
+            )
+            nn.init.normal_(
+                self.output_projection.weight, mean=0, std=output_embed_dim**-0.5
+            )
+        self.register_buffer("version", torch.Tensor([2]))
+        self.normalize = args.decoder_normalize_before and final_norm
+        if self.normalize:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self,
+        prev_output_tokens: Tensor,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        src_lengths: Optional[Any] = None,
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (dict, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder output of shape `(batch, tgt_len, vocab)`; left
+                  unprojected when adaptive softmax is in use
+                - a dict with ``attn`` (the last layer's attention, in a
+                  list) and ``inner_states`` (the input and every layer output)
+        """
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens,
+                incremental_state=incremental_state,
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:  # incremental decoding: only the newest step is fed forward
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens.contiguous())
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        inner_states: List[Optional[Tensor]] = [x]
+
+        # encoder output/mask are the same for every layer, so unpack them once
+        encoder: Optional[Tensor] = None
+        encoder_padding_mask: Optional[Tensor] = None
+        if encoder_out is not None:
+            if len(encoder_out["encoder_out"]) > 0:
+                encoder = encoder_out["encoder_out"][0]
+            if (
+                "encoder_padding_mask" in encoder_out
+                and len(encoder_out["encoder_padding_mask"]) > 0
+            ):
+                encoder_padding_mask = encoder_out["encoder_padding_mask"][0]
+
+        # decoder layers
+        attn: Optional[Tensor] = None
+        for layer in self.layers:
+            x, attn = layer(
+                x,
+                encoder,
+                encoder_padding_mask,
+                incremental_state,
+            )
+            inner_states.append(x)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        if self.adaptive_softmax is None:
+            # project back to size of vocabulary
+            x = self.output_projection(x)
+
+        return x, {"attn": [attn], "inner_states": inner_states}
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):  # cached square mask with -inf strictly above the diagonal
+        dim = tensor.size(0)
+        if (
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        if self._future_mask.size(0) < dim:  # cached mask too small: grow it in place and refill
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+
+
+class LightConvEncoderLayer(nn.Module):
+    """Encoder layer block.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        kernel_size: kernel size of the convolution
+    """
+
+    def __init__(self, args, kernel_size=0):
+        super().__init__()
+        self.embed_dim = args.encoder_embed_dim
+        self.conv_dim = args.encoder_conv_dim
+        padding_l = (  # an int for odd kernel sizes, a (left, right) pair for even ones
+            kernel_size // 2
+            if kernel_size % 2 == 1
+            else ((kernel_size - 1) // 2, kernel_size // 2)
+        )
+
+        if args.encoder_glu:
+            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)  # doubled width feeds the GLU
+            self.act = nn.GLU()
+        else:
+            self.linear1 = Linear(self.embed_dim, self.conv_dim)
+            self.act = None
+        if args.encoder_conv_type == "lightweight":
+            self.conv = LightweightConv(
+                self.conv_dim,
+                kernel_size,
+                padding_l=padding_l,
+                weight_softmax=args.weight_softmax,
+                num_heads=args.encoder_attention_heads,
+                weight_dropout=args.weight_dropout,
+            )
+        elif args.encoder_conv_type == "dynamic":
+            self.conv = DynamicConv(
+                self.conv_dim,
+                kernel_size,
+                padding_l=padding_l,
+                weight_softmax=args.weight_softmax,
+                num_heads=args.encoder_attention_heads,
+                weight_dropout=args.weight_dropout,
+            )
+        else:
+            raise NotImplementedError
+        self.linear2 = Linear(self.conv_dim, self.embed_dim)
+
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.relu_dropout_module = FairseqDropout(
+            args.relu_dropout, module_name=self.__class__.__name__
+        )
+        self.input_dropout_module = FairseqDropout(
+            args.input_dropout, module_name=self.__class__.__name__
+        )
+        self.normalize_before = args.encoder_normalize_before
+        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
+        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
+        self.layer_norm1 = LayerNorm(self.embed_dim)  # used around the convolution sub-block
+        self.layer_norm2 = LayerNorm(self.embed_dim)  # used around the feed-forward sub-block
+
+    def forward(self, x, encoder_padding_mask: Optional[Tensor] = None) -> Tensor:
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, src_len)` where padding elements are indicated by ``1``.
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        residual = x
+        normalize = self.maybe_layer_norm(before=True)
+        if normalize:
+            x = self.layer_norm1(x)
+        x = self.input_dropout_module(x)
+        x = self.linear1(x)
+        if self.act is not None:
+            x = self.act(x)
+        if encoder_padding_mask is not None:  # zero out padded positions before the convolution
+            x = x.masked_fill(encoder_padding_mask.transpose(0, 1).unsqueeze(2), 0)
+        x = self.conv(x)
+        x = self.linear2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        normalize = self.maybe_layer_norm(after=True)
+        if normalize:
+            x = self.layer_norm1(x)
+
+        residual = x  # second sub-block: position-wise feed-forward network
+        normalize = self.maybe_layer_norm(before=True)
+        if normalize:
+            x = self.layer_norm2(x)
+        x = F.relu(self.fc1(x))
+        x = self.relu_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        normalize = self.maybe_layer_norm(after=True)
+        if normalize:
+            x = self.layer_norm2(x)
+        return x
+
+    def maybe_layer_norm(self, before: bool = False, after: bool = False):
+        assert before ^ after, "Incorrect arguments"  # exactly one of the two flags must be set
+        return after ^ self.normalize_before  # True for `before` when normalize_before, else for `after`
+
+    def extra_repr(self):
+        return (
+            "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format(
+                self.dropout_module.p,
+                self.relu_dropout_module.p,
+                self.input_dropout_module.p,
+                self.normalize_before,
+            )
+        )
+
+
+class LightConvDecoderLayer(nn.Module):
+    """Decoder layer block.
+
+    The layer chains three residual sub-blocks: a lightweight or dynamic
+    convolution, an optional encoder attention, and a feed-forward network.
+    Layer norm is applied before or after each sub-block depending on
+    ``args.decoder_normalize_before``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): if ``True``, the layer is built
+            without the encoder-attention sub-block. Default: ``False``
+        kernel_size: kernel size of the convolution
+        dictionary: forwarded to ``MultiheadAttention`` when the
+            encoder-attention sub-block is built. Default: ``None``
+
+    Raises:
+        NotImplementedError: if ``args.decoder_conv_type`` is neither
+            ``"lightweight"`` nor ``"dynamic"``.
+    """
+
+    def __init__(self, args, no_encoder_attn=False, kernel_size=0, dictionary=None):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.conv_dim = args.decoder_conv_dim
+        # With GLU the input projection is twice as wide as the conv input.
+        if args.decoder_glu:
+            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
+            self.act = nn.GLU()
+        else:
+            self.linear1 = Linear(self.embed_dim, self.conv_dim)
+            self.act = None
+        # Both conv types take the same arguments; padding_l is kernel_size - 1
+        # (presumably so the convolution only sees past positions -- TODO confirm).
+        if args.decoder_conv_type == "lightweight":
+            self.conv = LightweightConv(
+                self.conv_dim,
+                kernel_size,
+                padding_l=kernel_size - 1,
+                weight_softmax=args.weight_softmax,
+                num_heads=args.decoder_attention_heads,
+                weight_dropout=args.weight_dropout,
+            )
+        elif args.decoder_conv_type == "dynamic":
+            self.conv = DynamicConv(
+                self.conv_dim,
+                kernel_size,
+                padding_l=kernel_size - 1,
+                weight_softmax=args.weight_softmax,
+                num_heads=args.decoder_attention_heads,
+                weight_dropout=args.weight_dropout,
+            )
+        else:
+            raise NotImplementedError
+        # Projects the conv output back to the embedding width.
+        self.linear2 = Linear(self.conv_dim, self.embed_dim)
+
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.relu_dropout_module = FairseqDropout(
+            args.relu_dropout, module_name=self.__class__.__name__
+        )
+        self.input_dropout_module = FairseqDropout(
+            args.input_dropout, module_name=self.__class__.__name__
+        )
+        self.normalize_before = args.decoder_normalize_before
+
+        self.conv_layer_norm = LayerNorm(self.embed_dim)
+
+        # The encoder-attention sub-block is optional; forward() skips it when
+        # self.encoder_attn is None.
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = MultiheadAttention(
+                self.embed_dim,
+                args.decoder_attention_heads,
+                dropout=args.attention_dropout,
+                encoder_decoder_attention=True,
+                dictionary=dictionary,
+            )
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
+
+        # Feed-forward sub-block.
+        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
+        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
+
+        self.final_layer_norm = LayerNorm(self.embed_dim)
+        # Attention weights are only requested outside training and while this
+        # flag is set; see make_generation_fast_().
+        self.need_attn = True
+
+    def forward(
+        self,
+        x: Tensor,
+        encoder_out: Optional[Tensor],
+        encoder_padding_mask: Optional[Tensor],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        prev_conv_state: Optional[Tensor] = None,
+        prev_attn_state: Optional[Tuple[Tensor, Tensor]] = None,
+        conv_mask: Optional[Tensor] = None,
+        conv_padding_mask: Optional[Tensor] = None,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_out (Tensor, optional): used as both key and value of the
+                encoder attention
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, src_len)` where padding elements are indicated by ``1``.
+            incremental_state: passed through to the convolution and to the
+                encoder attention
+            prev_conv_state (Tensor, optional): if given, stored as the
+                convolution's input buffer before the convolution runs
+            prev_attn_state (tuple, optional): if given, its two elements are
+                stored as ``prev_key`` / ``prev_value`` in the encoder
+                attention's input buffer
+            conv_mask, conv_padding_mask: accepted but not used by this method
+
+        Returns:
+            tuple ``(x, attn)``: ``x`` is the layer output after the residual
+            sub-blocks; ``attn`` is the second value returned by the encoder
+            attention, or ``None`` when the layer has no encoder attention.
+        """
+        # --- convolution sub-block ---
+        residual = x
+        normalize = self.maybe_layer_norm(before=True)
+        if normalize:
+            x = self.conv_layer_norm(x)
+        if prev_conv_state is not None:
+            self.conv._set_input_buffer(incremental_state, prev_conv_state)
+        x = self.input_dropout_module(x)
+        x = self.linear1(x)
+        if self.act is not None:
+            x = self.act(x)
+        x = self.conv(x, incremental_state=incremental_state)
+        x = self.linear2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        normalize = self.maybe_layer_norm(after=True)
+        if normalize:
+            x = self.conv_layer_norm(x)
+
+        # --- encoder-attention sub-block (skipped when built without it) ---
+        attn: Optional[Tensor] = None
+        if self.encoder_attn is not None:
+            residual = x
+            normalize = self.maybe_layer_norm(before=True)
+            if normalize:
+                x = self.encoder_attn_layer_norm(x)
+
+            if prev_attn_state is not None:
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_attn_state[0],
+                    "prev_value": prev_attn_state[1],
+                }
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=(not self.training and self.need_attn),
+            )
+            x = self.dropout_module(x)
+            x = residual + x
+            normalize = self.maybe_layer_norm(after=True)
+            if normalize:
+                x = self.encoder_attn_layer_norm(x)
+
+        # --- feed-forward sub-block ---
+        residual = x
+        normalize = self.maybe_layer_norm(before=True)
+        if normalize:
+            x = self.final_layer_norm(x)
+        x = F.relu(self.fc1(x))
+        x = self.relu_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        normalize = self.maybe_layer_norm(after=True)
+        if normalize:
+            x = self.final_layer_norm(x)
+        return x, attn
+
+    def maybe_layer_norm(self, before: bool = False, after: bool = False):
+        """Return whether layer norm should run at this call site.
+
+        Exactly one of ``before`` / ``after`` must be true. The result is
+        ``after ^ self.normalize_before``.
+        """
+        assert before ^ after, "Incorrect usage"
+        return after ^ self.normalize_before
+
+    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
+        """Record whether attention weights are needed; extra kwargs are ignored."""
+        self.need_attn = need_attn
+
+    def extra_repr(self):
+        """Return the dropout rates and ``normalize_before`` as a summary string."""
+        return (
+            "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format(
+                self.dropout_module.p,
+                self.relu_dropout_module.p,
+                self.input_dropout_module.p,
+                self.normalize_before,
+            )
+        )
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
+    nn.init.constant_(m.weight[padding_idx], 0)
+    return m
+
+
+def Linear(in_features, out_features, bias=True):
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)
+    if bias:
+        nn.init.constant_(m.bias, 0.0)
+    return m
+
+
+@register_model_architecture("lightconv", "lightconv")
+def base_architecture(args):
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 7)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.relu_dropout = getattr(args, "relu_dropout", 0.0)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    args.encoder_conv_dim = getattr(args, "encoder_conv_dim", args.encoder_embed_dim)
+    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)
+
+    args.encoder_kernel_size_list = getattr(
+        args, "encoder_kernel_size_list", [3, 7, 15, 31, 31, 31, 31]
+    )
+    args.decoder_kernel_size_list = getattr(
+        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
+    )
+    if len(args.encoder_kernel_size_list) == 1:
+        args.encoder_kernel_size_list = (
+            args.encoder_kernel_size_list * args.encoder_layers
+        )
+    if len(args.decoder_kernel_size_list) == 1:
+        args.decoder_kernel_size_list = (
+            args.decoder_kernel_size_list * args.decoder_layers
+        )
+    assert (
+        len(args.encoder_kernel_size_list) == args.encoder_layers
+    ), "encoder_kernel_size_list doesn't match encoder_layers"
+    assert (
+        len(args.decoder_kernel_size_list) == args.decoder_layers
+    ), "decoder_kernel_size_list doesn't match decoder_layers"
+    args.encoder_glu = getattr(args, "encoder_glu", True)
+    args.decoder_glu = getattr(args, "decoder_glu", True)
+    args.input_dropout = getattr(args, "input_dropout", 0.1)
+    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
+
+
+@register_model_architecture("lightconv", "lightconv_iwslt_de_en")
+def lightconv_iwslt_de_en(args):
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
+    args.encoder_layers = getattr(args, "encoder_layers", 7)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.weight_dropout = getattr(args, "weight_dropout", 0.1)
+    args.encoder_glu = getattr(args, "encoder_glu", False)
+    args.decoder_glu = getattr(args, "decoder_glu", False)
+    args.input_dropout = getattr(args, "input_dropout", 0.0)
+    base_architecture(args)
+
+
+@register_model_architecture("lightconv", "lightconv_wmt_en_de")
+def lightconv_wmt_en_de(args):
+    """Register ``lightconv_wmt_en_de``; it applies ``base_architecture`` unchanged."""
+    base_architecture(args)
+
+
+@register_model_architecture("lightconv", "lightconv_wmt_en_de_big")
+def lightconv_wmt_en_de_big(args):
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.dropout = getattr(args, "dropout", 0.3)
+    base_architecture(args)
+
+
+@register_model_architecture("lightconv", "lightconv_wmt_en_fr_big")
+def lightconv_wmt_en_fr_big(args):
+    args.dropout = getattr(args, "dropout", 0.1)
+    lightconv_wmt_en_de_big(args)
+
+
+@register_model_architecture("lightconv", "lightconv_wmt_zh_en_big")
+def lightconv_wmt_zh_en_big(args):
+    args.dropout = getattr(args, "dropout", 0.2)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.2)
+    args.weight_dropout = getattr(args, "weight_dropout", 0.2)
+    lightconv_wmt_en_de_big(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
new file mode 100644
index 0000000000..1d9efc4e42
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
@@ -0,0 +1,306 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.lightconv import Embedding, LightConvDecoder
+from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
+
+
+@register_model("lightconv_lm")
+class LightConvLanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--dropout",
+            default=0.1,
+            type=float,
+            metavar="D",
+            help="dropout probability",
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            default=0.0,
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--relu-dropout",
+            default=0.0,
+            type=float,
+            metavar="D",
+            help="dropout probability after ReLU in FFN",
+        )
+        parser.add_argument(
+            "--input-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability of the inputs",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-output-dim",
+            type=int,
+            metavar="N",
+            help="decoder output dimension",
+        )
+        parser.add_argument(
+            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads or LightConv/DynamicConv heads",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            default=False,
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive softmax cutoff points. "
+            "Must be used with adaptive_loss criterion",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-dropout",
+            type=float,
+            metavar="D",
+            help="sets adaptive softmax dropout for the tail projections",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-factor",
+            type=float,
+            metavar="N",
+            help="adaptive input factor",
+        )
+        parser.add_argument(
+            "--no-token-positional-embeddings",
+            default=False,
+            action="store_true",
+            help="if set, disables positional embeddings (outside self attention)",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            default=False,
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--character-embeddings",
+            default=False,
+            action="store_true",
+            help="if set, uses character embedding convolutions to produce token embeddings",
+        )
+        parser.add_argument(
+            "--character-filters",
+            type=str,
+            metavar="LIST",
+            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
+            help="size of character embeddings",
+        )
+        parser.add_argument(
+            "--character-embedding-dim",
+            type=int,
+            metavar="N",
+            default=4,
+            help="size of character embeddings",
+        )
+        parser.add_argument(
+            "--char-embedder-highway-layers",
+            type=int,
+            metavar="N",
+            default=2,
+            help="number of highway layers for character token embeddder",
+        )
+        parser.add_argument(
+            "--adaptive-input",
+            default=False,
+            action="store_true",
+            help="if set, uses adaptive input",
+        )
+        parser.add_argument(
+            "--adaptive-input-factor",
+            type=float,
+            metavar="N",
+            help="adaptive input factor",
+        )
+        parser.add_argument(
+            "--adaptive-input-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive input cutoff points.",
+        )
+        parser.add_argument(
+            "--tie-adaptive-weights",
+            action="store_true",
+            help="if set, ties the weights of adaptive softmax and adaptive input",
+        )
+        parser.add_argument(
+            "--tie-adaptive-proj",
+            action="store_true",
+            help="if set, ties the projection weights of adaptive softmax and adaptive input",
+        )
+        parser.add_argument(
+            "--decoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the decoder",
+        )
+
+        """LightConv and DynamicConv arguments"""
+        parser.add_argument(
+            "--decoder-kernel-size-list",
+            type=lambda x: utils.eval_str_list(x, int),
+            help='list of kernel size (default: "[3,7,15,31,31,31]")',
+        )
+        parser.add_argument(
+            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
+        )
+        parser.add_argument(
+            "--decoder-conv-type",
+            default="dynamic",
+            type=str,
+            choices=["dynamic", "lightweight"],
+            help="type of convolution",
+        )
+        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
+        parser.add_argument(
+            "--weight-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for conv weights",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if getattr(args, "max_source_positions", None) is None:
+            args.max_source_positions = args.tokens_per_sample
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = args.tokens_per_sample
+
+        if args.character_embeddings:
+            embed_tokens = CharacterTokenEmbedder(
+                task.dictionary,
+                eval(args.character_filters),
+                args.character_embedding_dim,
+                args.decoder_embed_dim,
+                args.char_embedder_highway_layers,
+            )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(
+                len(task.dictionary),
+                task.dictionary.pad(),
+                args.decoder_input_dim,
+                args.adaptive_input_factor,
+                args.decoder_embed_dim,
+                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
+            )
+        else:
+            embed_tokens = Embedding(
+                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
+            )
+
+        if args.tie_adaptive_weights:
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert (
+                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
+            ), "{} != {}".format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
+            )
+            assert args.decoder_input_dim == args.decoder_output_dim
+
+        decoder = LightConvDecoder(
+            args,
+            task.output_dictionary,
+            embed_tokens,
+            no_encoder_attn=True,
+            final_norm=False,
+        )
+        return LightConvLanguageModel(decoder)
+
+
+@register_model_architecture("lightconv_lm", "lightconv_lm")
+def base_lm_architecture(args):
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+
+    args.character_embeddings = getattr(args, "character_embeddings", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)
+
+    # The model training is not stable without this
+    args.decoder_normalize_before = True
+
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
+    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)
+
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
+
+    args.decoder_kernel_size_list = getattr(
+        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
+    )
+    if len(args.decoder_kernel_size_list) == 1:
+        args.decoder_kernel_size_list = (
+            args.decoder_kernel_size_list * args.decoder_layers
+        )
+    assert (
+        len(args.decoder_kernel_size_list) == args.decoder_layers
+    ), "decoder_kernel_size_list doesn't match decoder_layers"
+    args.decoder_glu = getattr(args, "decoder_glu", True)
+    args.input_dropout = getattr(args, "input_dropout", 0.1)
+    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
+
+
+@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
+def lightconv_lm_gbw(args):
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    base_lm_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
new file mode 100644
index 0000000000..8a29156270
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
@@ -0,0 +1,755 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import AdaptiveSoftmax, FairseqDropout
+from torch import Tensor
+
+
+# Fallback position limits: LSTMModel.build_model uses them when ``args`` has no
+# ``max_source_positions`` / ``max_target_positions``. Note that ``1e5`` is a
+# float literal, not an int.
+DEFAULT_MAX_SOURCE_POSITIONS = 1e5
+DEFAULT_MAX_TARGET_POSITIONS = 1e5
+
+
+@register_model("lstm")
+class LSTMModel(FairseqEncoderDecoderModel):
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
+                            help='encoder embedding dimension')
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained encoder embedding')
+        parser.add_argument('--encoder-freeze-embed', action='store_true',
+                            help='freeze encoder embeddings')
+        parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
+                            help='encoder hidden size')
+        parser.add_argument('--encoder-layers', type=int, metavar='N',
+                            help='number of encoder layers')
+        parser.add_argument('--encoder-bidirectional', action='store_true',
+                            help='make all layers of encoder bidirectional')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-freeze-embed', action='store_true',
+                            help='freeze decoder embeddings')
+        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
+                            help='decoder hidden size')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='number of decoder layers')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
+                            help='decoder attention')
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--share-decoder-input-output-embed', default=False,
+                            action='store_true',
+                            help='share decoder input and output embeddings')
+        parser.add_argument('--share-all-embeddings', default=False, action='store_true',
+                            help='share encoder, decoder and output embeddings'
+                                 ' (requires shared dictionary and embed dim)')
+
+        # Granular dropout settings (if not specified these default to --dropout)
+        parser.add_argument('--encoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for encoder input embedding')
+        parser.add_argument('--encoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for encoder output')
+        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for decoder input embedding')
+        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for decoder output')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure that all args are properly defaulted (in case there are any new ones)
+        base_architecture(args)
+
+        if args.encoder_layers != args.decoder_layers:
+            raise ValueError("--encoder-layers must match --decoder-layers")
+
+        max_source_positions = getattr(
+            args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
+        )
+        max_target_positions = getattr(
+            args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
+        )
+
+        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+            embed_dict = utils.parse_embedding(embed_path)
+            utils.print_embed_overlap(embed_dict, dictionary)
+            return utils.load_embedding(embed_dict, dictionary, embed_tokens)
+
+        if args.encoder_embed_path:
+            pretrained_encoder_embed = load_pretrained_embedding_from_file(
+                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
+            )
+        else:
+            num_embeddings = len(task.source_dictionary)
+            pretrained_encoder_embed = Embedding(
+                num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
+            )
+
+        if args.share_all_embeddings:
+            # double check all parameters combinations are valid
+            if task.source_dictionary != task.target_dictionary:
+                raise ValueError("--share-all-embeddings requires a joint dictionary")
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embed not compatible with --decoder-embed-path"
+                )
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to "
+                    "match --decoder-embed-dim"
+                )
+            pretrained_decoder_embed = pretrained_encoder_embed
+            args.share_decoder_input_output_embed = True
+        else:
+            # separate decoder input embeddings
+            pretrained_decoder_embed = None
+            if args.decoder_embed_path:
+                pretrained_decoder_embed = load_pretrained_embedding_from_file(
+                    args.decoder_embed_path,
+                    task.target_dictionary,
+                    args.decoder_embed_dim,
+                )
+        # one last double check of parameter combinations
+        if args.share_decoder_input_output_embed and (
+            args.decoder_embed_dim != args.decoder_out_embed_dim
+        ):
+            raise ValueError(
+                "--share-decoder-input-output-embeddings requires "
+                "--decoder-embed-dim to match --decoder-out-embed-dim"
+            )
+
+        if args.encoder_freeze_embed:
+            pretrained_encoder_embed.weight.requires_grad = False
+        if args.decoder_freeze_embed:
+            pretrained_decoder_embed.weight.requires_grad = False
+
+        encoder = LSTMEncoder(
+            dictionary=task.source_dictionary,
+            embed_dim=args.encoder_embed_dim,
+            hidden_size=args.encoder_hidden_size,
+            num_layers=args.encoder_layers,
+            dropout_in=args.encoder_dropout_in,
+            dropout_out=args.encoder_dropout_out,
+            bidirectional=args.encoder_bidirectional,
+            pretrained_embed=pretrained_encoder_embed,
+            max_source_positions=max_source_positions,
+        )
+        decoder = LSTMDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            hidden_size=args.decoder_hidden_size,
+            out_embed_dim=args.decoder_out_embed_dim,
+            num_layers=args.decoder_layers,
+            dropout_in=args.decoder_dropout_in,
+            dropout_out=args.decoder_dropout_out,
+            attention=utils.eval_bool(args.decoder_attention),
+            encoder_output_units=encoder.output_units,
+            pretrained_embed=pretrained_decoder_embed,
+            share_input_output_embed=args.share_decoder_input_output_embed,
+            adaptive_softmax_cutoff=(
+                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == "adaptive_loss"
+                else None
+            ),
+            max_target_positions=max_target_positions,
+            residuals=False,
+        )
+        return cls(encoder, decoder)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """Run the encoder on the source batch and feed its output to the decoder."""
+        # The decoder's return value is handed back to the caller untouched.
+        return self.decoder(
+            prev_output_tokens,
+            encoder_out=self.encoder(src_tokens, src_lengths=src_lengths),
+            incremental_state=incremental_state,
+        )
+
+
+class LSTMEncoder(FairseqEncoder):
+    """LSTM encoder."""
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        hidden_size=512,
+        num_layers=1,
+        dropout_in=0.1,
+        dropout_out=0.1,
+        bidirectional=False,
+        left_pad=True,
+        pretrained_embed=None,
+        padding_idx=None,
+        max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
+    ):
+        super().__init__(dictionary)
+        self.num_layers = num_layers
+        self.dropout_in_module = FairseqDropout(
+            dropout_in * 1.0, module_name=self.__class__.__name__  # int -> float
+        )
+        self.dropout_out_module = FairseqDropout(
+            dropout_out * 1.0, module_name=self.__class__.__name__
+        )
+        self.bidirectional = bidirectional
+        self.hidden_size = hidden_size
+        self.max_source_positions = max_source_positions
+
+        num_embeddings = len(dictionary)
+        self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad()
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed  # reuse the caller's embedding module
+        # the LSTM only gets a dropout rate when it has more than one layer
+        self.lstm = LSTM(
+            input_size=embed_dim,
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+            dropout=self.dropout_out_module.p if num_layers > 1 else 0.0,
+            bidirectional=bidirectional,
+        )
+        self.left_pad = left_pad
+
+        self.output_units = hidden_size
+        if bidirectional:
+            self.output_units *= 2  # forward and backward features are concatenated
+
+    def forward(
+        self,
+        src_tokens: Tensor,
+        src_lengths: Tensor,
+        enforce_sorted: bool = True,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): source tokens, shape `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths, shape `(batch)`
+            enforce_sorted (bool, optional): if True, `src_tokens` is expected
+                to hold sequences sorted by decreasing length. Default: True.
+
+        Returns:
+            tuple: `(x, final_hiddens, final_cells, encoder_padding_mask)`, with
+                the shapes noted on the return statement below.
+        """
+        if self.left_pad:
+            # nn.utils.rnn.pack_padded_sequence requires right-padding;
+            # convert left-padding to right-padding
+            src_tokens = utils.convert_padding_direction(
+                src_tokens,
+                torch.zeros_like(src_tokens).fill_(self.padding_idx),
+                left_to_right=True,
+            )
+
+        bsz, seqlen = src_tokens.size()
+
+        # embed tokens
+        x = self.embed_tokens(src_tokens)
+        x = self.dropout_in_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # pack embedded source tokens into a PackedSequence
+        packed_x = nn.utils.rnn.pack_padded_sequence(
+            x, src_lengths.cpu(), enforce_sorted=enforce_sorted
+        )
+
+        # apply LSTM, starting from all-zero hidden and cell states
+        if self.bidirectional:
+            state_size = 2 * self.num_layers, bsz, self.hidden_size
+        else:
+            state_size = self.num_layers, bsz, self.hidden_size
+        h0 = x.new_zeros(*state_size)
+        c0 = x.new_zeros(*state_size)
+        packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
+
+        # unpack outputs and apply dropout
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            packed_outs, padding_value=self.padding_idx * 1.0  # float(padding_idx)
+        )
+        x = self.dropout_out_module(x)
+        assert list(x.size()) == [seqlen, bsz, self.output_units]
+
+        if self.bidirectional:
+            final_hiddens = self.combine_bidir(final_hiddens, bsz)
+            final_cells = self.combine_bidir(final_cells, bsz)
+
+        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # True at padding
+
+        return tuple(
+            (
+                x,  # seq_len x batch x hidden
+                final_hiddens,  # num_layers x batch x num_directions*hidden
+                final_cells,  # num_layers x batch x num_directions*hidden
+                encoder_padding_mask,  # seq_len x batch
+            )
+        )
+
+    def combine_bidir(self, outs, bsz: int):
+        paired = outs.view(self.num_layers, 2, bsz, -1).transpose(1, 2)  # L x B x 2 x H
+        return paired.contiguous().view(self.num_layers, bsz, -1)  # L x B x 2H
+
+    def reorder_encoder_out(
+        self, encoder_out: Tuple[Tensor, Tensor, Tensor, Tensor], new_order
+    ):
+        """Apply `index_select(1, new_order)` to each of the four encoder outputs."""
+        outs, hiddens, cells, pad_mask = encoder_out
+        return (
+            outs.index_select(1, new_order),
+            hiddens.index_select(1, new_order),
+            cells.index_select(1, new_order),
+            pad_mask.index_select(1, new_order),
+        )
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder (`max_source_positions`)."""
+        return self.max_source_positions
+
+
+class AttentionLayer(nn.Module):
+    """Dot-product attention over encoder states with a tanh output projection."""
+    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
+        super().__init__()
+        self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
+        self.output_proj = Linear(
+            input_embed_dim + source_embed_dim, output_embed_dim, bias=bias
+        )
+
+    def forward(self, input, source_hids, encoder_padding_mask):
+        """Attend over `source_hids` (srclen x bsz x source_embed_dim) using the
+        query `input` (bsz x input_embed_dim); returns `(output, attn_scores)`."""
+
+        # x: bsz x source_embed_dim
+        x = self.input_proj(input)
+
+        # compute attention: dot product of the query with every source state
+        attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
+
+        # don't attend over padding
+        if encoder_padding_mask is not None:
+            attn_scores = (
+                attn_scores.float()
+                .masked_fill_(encoder_padding_mask, float("-inf"))
+                .type_as(attn_scores)
+            )  # FP16 support: cast to float and back
+
+        attn_scores = F.softmax(attn_scores, dim=0)  # srclen x bsz
+
+        # sum weighted sources
+        x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
+
+        x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
+        return x, attn_scores
+
+
+class LSTMDecoder(FairseqIncrementalDecoder):
+    """LSTM decoder."""
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        hidden_size=512,
+        out_embed_dim=512,
+        num_layers=1,
+        dropout_in=0.1,
+        dropout_out=0.1,
+        attention=True,
+        encoder_output_units=512,
+        pretrained_embed=None,
+        share_input_output_embed=False,
+        adaptive_softmax_cutoff=None,
+        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
+        residuals=False,
+    ):
+        super().__init__(dictionary)
+        self.dropout_in_module = FairseqDropout(
+            dropout_in * 1.0, module_name=self.__class__.__name__  # int -> float
+        )
+        self.dropout_out_module = FairseqDropout(
+            dropout_out * 1.0, module_name=self.__class__.__name__
+        )
+        self.hidden_size = hidden_size
+        self.share_input_output_embed = share_input_output_embed
+        self.need_attn = True  # overwritten by make_generation_fast_
+        self.max_target_positions = max_target_positions
+        self.residuals = residuals
+        self.num_layers = num_layers
+
+        self.adaptive_softmax = None
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        if pretrained_embed is None:
+            self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+        else:
+            self.embed_tokens = pretrained_embed
+        # project the encoder's final states when their width differs from hidden_size
+        self.encoder_output_units = encoder_output_units
+        if encoder_output_units != hidden_size and encoder_output_units != 0:
+            self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
+            self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
+        else:
+            self.encoder_hidden_proj = self.encoder_cell_proj = None
+
+        # disable input feeding if there is no encoder
+        # input feeding is described in arxiv.org/abs/1508.04025
+        input_feed_size = 0 if encoder_output_units == 0 else hidden_size
+        self.layers = nn.ModuleList(
+            [
+                LSTMCell(
+                    input_size=input_feed_size + embed_dim
+                    if layer == 0
+                    else hidden_size,
+                    hidden_size=hidden_size,
+                )
+                for layer in range(num_layers)
+            ]
+        )
+
+        if attention:
+            # TODO make bias configurable
+            self.attention = AttentionLayer(
+                hidden_size, encoder_output_units, hidden_size, bias=False
+            )
+        else:
+            self.attention = None
+        # additional_fc exists only when needed; extract_features tests it via hasattr
+        if hidden_size != out_embed_dim:
+            self.additional_fc = Linear(hidden_size, out_embed_dim)
+        # output layer; note that the Linear helper ignores its `dropout` argument
+        if adaptive_softmax_cutoff is not None:
+            # setting adaptive_softmax dropout to dropout_out for now but can be redefined
+            self.adaptive_softmax = AdaptiveSoftmax(
+                num_embeddings,
+                hidden_size,
+                adaptive_softmax_cutoff,
+                dropout=dropout_out,
+            )
+        elif not self.share_input_output_embed:
+            self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        src_lengths: Optional[Tensor] = None,
+    ):
+        """Extract decoder features, then project them with `output_layer`."""
+        out = self.extract_features(prev_output_tokens, encoder_out, incremental_state)
+        feats, attn = out
+        return self.output_layer(feats), attn
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """Like *forward*, but without the final `output_layer` projection.
+
+        Returns `(features, attn_scores)`; attn_scores is None while training."""
+        # get outputs from encoder (empty placeholders when there is no encoder)
+        if encoder_out is not None:
+            encoder_outs = encoder_out[0]
+            encoder_hiddens = encoder_out[1]
+            encoder_cells = encoder_out[2]
+            encoder_padding_mask = encoder_out[3]
+        else:
+            encoder_outs = torch.empty(0)
+            encoder_hiddens = torch.empty(0)
+            encoder_cells = torch.empty(0)
+            encoder_padding_mask = torch.empty(0)
+        srclen = encoder_outs.size(0)
+        # with a non-empty incremental state only the newest target token is processed
+        if incremental_state is not None and len(incremental_state) > 0:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+
+        bsz, seqlen = prev_output_tokens.size()
+
+        # embed tokens
+        x = self.embed_tokens(prev_output_tokens)
+        x = self.dropout_in_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # initialize previous states (or get from cache during incremental generation)
+        if incremental_state is not None and len(incremental_state) > 0:
+            prev_hiddens, prev_cells, input_feed = self.get_cached_state(
+                incremental_state
+            )
+        elif encoder_out is not None:
+            # setup recurrent cells from the encoder's final states
+            prev_hiddens = [encoder_hiddens[i] for i in range(self.num_layers)]
+            prev_cells = [encoder_cells[i] for i in range(self.num_layers)]
+            if self.encoder_hidden_proj is not None:
+                prev_hiddens = [self.encoder_hidden_proj(y) for y in prev_hiddens]
+                prev_cells = [self.encoder_cell_proj(y) for y in prev_cells]
+            input_feed = x.new_zeros(bsz, self.hidden_size)
+        else:
+            # setup zero cells, since there is no encoder
+            zero_state = x.new_zeros(bsz, self.hidden_size)
+            prev_hiddens = [zero_state for i in range(self.num_layers)]
+            prev_cells = [zero_state for i in range(self.num_layers)]
+            input_feed = None
+
+        assert (
+            srclen > 0 or self.attention is None
+        ), "attention is not supported if there are no encoder outputs"
+        attn_scores: Optional[Tensor] = (
+            x.new_zeros(srclen, seqlen, bsz) if self.attention is not None else None
+        )
+        outs = []
+        for j in range(seqlen):
+            # input feeding: concatenate context vector from previous time step
+            if input_feed is not None:
+                input = torch.cat((x[j, :, :], input_feed), dim=1)
+            else:
+                input = x[j]
+
+            for i, rnn in enumerate(self.layers):
+                # recurrent cell
+                hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
+
+                # hidden state becomes the input to the next layer
+                input = self.dropout_out_module(hidden)
+                if self.residuals:
+                    # NOTE(review): adds this layer's hidden state from the previous
+                    # time step, not the layer's input -- confirm this is intended
+                    input = input + prev_hiddens[i]
+                # save state for next time step
+                prev_hiddens[i] = hidden
+                prev_cells[i] = cell
+            # apply attention using the last layer's hidden state
+            if self.attention is not None:
+                assert attn_scores is not None
+                out, attn_scores[:, j, :] = self.attention(
+                    hidden, encoder_outs, encoder_padding_mask
+                )
+            else:
+                out = hidden
+            out = self.dropout_out_module(out)
+
+            # input feeding: this step's output is concatenated to the next input
+            if input_feed is not None:
+                input_feed = out
+
+            # save final output
+            outs.append(out)
+
+        # stack the per-layer states and cache them under "cached_state"
+        prev_hiddens_tensor = torch.stack(prev_hiddens)
+        prev_cells_tensor = torch.stack(prev_cells)
+        cache_state = torch.jit.annotate(
+            Dict[str, Optional[Tensor]],
+            {
+                "prev_hiddens": prev_hiddens_tensor,
+                "prev_cells": prev_cells_tensor,
+                "input_feed": input_feed,
+            },
+        )
+        self.set_incremental_state(incremental_state, "cached_state", cache_state)
+
+        # collect outputs across time steps
+        x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(1, 0)
+
+        if hasattr(self, "additional_fc") and self.adaptive_softmax is None:
+            x = self.additional_fc(x)
+            x = self.dropout_out_module(x)
+        # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
+        if not self.training and self.need_attn and self.attention is not None:
+            assert attn_scores is not None
+            attn_scores = attn_scores.transpose(0, 2)
+        else:
+            attn_scores = None  # training, need_attn is False, or no attention layer
+        return x, attn_scores
+
+    def output_layer(self, x):
+        """Map features to vocabulary logits, unless adaptive softmax is in use."""
+        if self.adaptive_softmax is not None:
+            # features are returned as-is when adaptive softmax is configured
+            return x
+        if self.share_input_output_embed:
+            return F.linear(x, self.embed_tokens.weight)
+        return self.fc_out(x)
+
+    def get_cached_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+    ) -> Tuple[List[Tensor], List[Tensor], Optional[Tensor]]:
+        """Unpack the cached decoder state into per-layer lists plus the input feed."""
+        state = self.get_incremental_state(incremental_state, "cached_state")
+        assert state is not None
+        stacked_h = state["prev_hiddens"]
+        assert stacked_h is not None
+        stacked_c = state["prev_cells"]
+        assert stacked_c is not None
+        # split the stacked tensors back into one entry per layer
+        hiddens = [stacked_h[i] for i in range(self.num_layers)]
+        cells = [stacked_c[i] for i in range(self.num_layers)]
+        # input_feed can be None for decoder-only language models
+        return hiddens, cells, state["input_feed"]
+
+    def reorder_incremental_state(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        new_order: Tensor,
+    ):
+        """Reorder the cached decoder state along dim 0 of each layer's tensors."""
+        if incremental_state is None or len(incremental_state) == 0:
+            return
+        prev_hiddens, prev_cells, input_feed = self.get_cached_state(incremental_state)
+        prev_hiddens = [p.index_select(0, new_order) for p in prev_hiddens]
+        prev_cells = [p.index_select(0, new_order) for p in prev_cells]
+        if input_feed is not None:
+            input_feed = input_feed.index_select(0, new_order)
+        cached_state_new = torch.jit.annotate(
+            Dict[str, Optional[Tensor]],
+            {
+                "prev_hiddens": torch.stack(prev_hiddens),
+                "prev_cells": torch.stack(prev_cells),
+                "input_feed": input_feed,
+            },
+        )
+        self.set_incremental_state(incremental_state, "cached_state", cached_state_new)
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder (`max_target_positions`)."""
+        return self.max_target_positions
+
+    def make_generation_fast_(self, need_attn=False, **kwargs):
+        self.need_attn = need_attn  # extract_features drops attn_scores when False
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.uniform_(emb.weight, a=-0.1, b=0.1)  # every row drawn from U(-0.1, 0.1)
+    nn.init.zeros_(emb.weight[padding_idx])  # then the padding row is zeroed again
+    return emb
+
+
+def LSTM(input_size, hidden_size, **kwargs):
+    """Build an nn.LSTM with weights and biases redrawn from U(-0.1, 0.1)."""
+    m = nn.LSTM(input_size, hidden_size, **kwargs)
+    for w in (p for n, p in m.named_parameters() if "weight" in n or "bias" in n):
+        nn.init.uniform_(w, -0.1, 0.1)
+    return m
+
+
+def LSTMCell(input_size, hidden_size, **kwargs):
+    """Build an nn.LSTMCell with weights and biases redrawn from U(-0.1, 0.1)."""
+    m = nn.LSTMCell(input_size, hidden_size, **kwargs)
+    for w in (p for n, p in m.named_parameters() if "weight" in n or "bias" in n):
+        nn.init.uniform_(w, -0.1, 0.1)
+    return m
+
+
+def Linear(in_features, out_features, bias=True, dropout=0.0):
+    """Linear layer (input: N x T x C) with U(-0.1, 0.1) init; `dropout` is unused."""
+    layer = nn.Linear(in_features, out_features, bias=bias)
+    nn.init.uniform_(layer.weight, -0.1, 0.1)
+    if bias:
+        nn.init.uniform_(layer.bias, -0.1, 0.1)
+    return layer
+
+
+@register_model_architecture("lstm", "lstm")
+def base_architecture(args):
+    """Set every LSTM hyper-parameter that is missing from `args` to its default."""
+
+    def default(name, value):
+        setattr(args, name, getattr(args, name, value))
+
+    default("dropout", 0.1)
+    # encoder
+    default("encoder_embed_dim", 512)
+    default("encoder_embed_path", None)
+    default("encoder_freeze_embed", False)
+    default("encoder_hidden_size", args.encoder_embed_dim)
+    default("encoder_layers", 1)
+    default("encoder_bidirectional", False)
+    default("encoder_dropout_in", args.dropout)
+    default("encoder_dropout_out", args.dropout)
+    # decoder
+    default("decoder_embed_dim", 512)
+    default("decoder_embed_path", None)
+    default("decoder_freeze_embed", False)
+    default("decoder_hidden_size", args.decoder_embed_dim)
+    default("decoder_layers", 1)
+    default("decoder_out_embed_dim", 512)
+    default("decoder_attention", "1")
+    default("decoder_dropout_in", args.dropout)
+    default("decoder_dropout_out", args.dropout)
+    # embedding sharing and output layer
+    default("share_decoder_input_output_embed", False)
+    default("share_all_embeddings", False)
+    default("adaptive_softmax_cutoff", "10000,50000,200000")
+
+
+@register_model_architecture("lstm", "lstm_wiseman_iwslt_de_en")
+def lstm_wiseman_iwslt_de_en(args):
+    args.dropout = getattr(args, "dropout", 0.1)  # set first: read again below
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
+    args.encoder_dropout_in = getattr(args, "encoder_dropout_in", 0)
+    args.encoder_dropout_out = getattr(args, "encoder_dropout_out", 0)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256)
+    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256)
+    args.decoder_dropout_in = getattr(args, "decoder_dropout_in", 0)
+    args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout)
+    base_architecture(args)  # fills in every field that is still unset
+
+
+@register_model_architecture("lstm", "lstm_luong_wmt_en_de")
+def lstm_luong_wmt_en_de(args):
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1000)
+    args.encoder_layers = getattr(args, "encoder_layers", 4)
+    args.encoder_dropout_out = getattr(args, "encoder_dropout_out", 0)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1000)
+    args.decoder_layers = getattr(args, "decoder_layers", 4)
+    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 1000)
+    args.decoder_dropout_out = getattr(args, "decoder_dropout_out", 0)
+    base_architecture(args)  # fills in every field that is still unset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
new file mode 100644
index 0000000000..454f0ac36f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
@@ -0,0 +1,142 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.lstm import Embedding, LSTMDecoder
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1e5
+
+
+@register_model("lstm_lm")
+class LSTMLanguageModel(FairseqLanguageModel):
+    def __init__(self, decoder):
+        super().__init__(decoder)  # no state beyond what the base class stores
+
+    @staticmethod
+    def add_args(parser):
+        """Register the LSTM language model's command-line options on `parser`."""
+        # fmt: off
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability')
+        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
+                            help='decoder embedding dimension')
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
+                            help='path to pre-trained decoder embedding')
+        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
+                            help='decoder hidden size')
+        parser.add_argument('--decoder-layers', type=int, metavar='N',
+                            help='number of decoder layers')
+        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
+                            help='decoder output embedding dimension')
+        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
+                            help='decoder attention')  # NOTE: build_model always passes attention=False
+        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
+                            help='comma separated list of adaptive softmax cutoff points. '
+                                 'Must be used with adaptive_loss criterion')
+        parser.add_argument('--residuals', default=False,
+                            action='store_true',
+                            help='applying residuals between LSTM layers')
+
+        # Granular dropout settings (base_architecture defaults them to --dropout)
+        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
+                            help='dropout probability for decoder input embedding')
+        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
+                            help='dropout probability for decoder output')
+        parser.add_argument('--share-decoder-input-output-embed', default=False,
+                            action='store_true',
+                            help='share decoder input and output embeddings')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a decoder-only LSTM language model from `args` and `task`."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+        # an explicit max_target_positions wins over tokens_per_sample
+        if getattr(args, "max_target_positions", None) is not None:
+            max_target_positions = args.max_target_positions
+        else:
+            max_target_positions = getattr(
+                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+            )
+
+        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
+            embed_dict = utils.parse_embedding(embed_path)
+            utils.print_embed_overlap(embed_dict, dictionary)
+            return utils.load_embedding(embed_dict, dictionary, embed_tokens)
+        # NOTE(review): embeddings use task.target_dictionary, the decoder task.dictionary
+        pretrained_decoder_embed = None
+        if args.decoder_embed_path:
+            pretrained_decoder_embed = load_pretrained_embedding_from_file(
+                args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim
+            )
+
+        if args.share_decoder_input_output_embed:
+            # double check all parameters combinations are valid
+            if task.source_dictionary != task.target_dictionary:
+                raise ValueError(
+                    "--share-decoder-input-output-embed requires a joint dictionary"
+                )
+
+            if args.decoder_embed_dim != args.decoder_out_embed_dim:
+                raise ValueError(
+                    "--share-decoder-input-output-embed requires "
+                    "--decoder-embed-dim to match --decoder-out-embed-dim"
+                )
+
+        decoder = LSTMDecoder(
+            dictionary=task.dictionary,
+            embed_dim=args.decoder_embed_dim,
+            hidden_size=args.decoder_hidden_size,
+            out_embed_dim=args.decoder_out_embed_dim,
+            num_layers=args.decoder_layers,
+            dropout_in=args.decoder_dropout_in,
+            dropout_out=args.decoder_dropout_out,
+            attention=False,  # decoder-only language model doesn't support attention
+            encoder_output_units=0,
+            pretrained_embed=pretrained_decoder_embed,
+            share_input_output_embed=args.share_decoder_input_output_embed,
+            adaptive_softmax_cutoff=(
+                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
+                if args.criterion == "adaptive_loss"
+                else None
+            ),
+            max_target_positions=max_target_positions,
+            residuals=args.residuals,
+        )
+
+        return cls(decoder)
+
+
+@register_model_architecture("lstm_lm", "lstm_lm")
+def base_architecture(args):
+    """Set every LSTM-LM hyper-parameter that is missing from `args` to its default."""
+
+    def default(name, value):
+        setattr(args, name, getattr(args, name, value))
+
+    default("dropout", 0.1)
+    default("decoder_embed_dim", 512)
+    default("decoder_embed_path", None)
+    default("decoder_hidden_size", args.decoder_embed_dim)
+    default("decoder_layers", 1)
+    default("decoder_out_embed_dim", 512)
+    default("decoder_attention", "0")
+    # granular dropout rates fall back to the global dropout value
+    default("decoder_dropout_in", args.dropout)
+    default("decoder_dropout_out", args.dropout)
+    default("share_decoder_input_output_embed", False)
+    default("adaptive_softmax_cutoff", "10000,50000,200000")
+    default("residuals", False)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
new file mode 100644
index 0000000000..5cb49dd77c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
@@ -0,0 +1,404 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    LayerNorm,
+    SinusoidalPositionalEmbedding,
+    TransformerSentenceEncoder,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.utils import safe_hasattr
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("masked_lm")
+class MaskedLMModel(FairseqEncoderModel):
+    """
+    Masked language model wrapping a single encoder. It also supports an
+    additional sentence level prediction if the sent-loss argument is set.
+    """
+
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+        # Opt-in BERT initialization; it is applied here, on the whole model,
+        # so the output embeddings and projection layers are covered as well.
+        wants_bert_init = getattr(args, "apply_bert_init", False)
+        if wants_bert_init:
+            self.apply(init_bert_params)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the model-specific command-line options on ``parser``.
+
+        No option declares a default here; defaults come from the architectures.
+        """
+        add = parser.add_argument
+
+        # Dropout rates
+        add("--dropout", type=float, metavar="D", help="dropout probability")
+        add(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        add(
+            "--act-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN",
+        )
+
+        # Hidden states and self-attention
+        add(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        add("--encoder-layers", type=int, metavar="N", help="num encoder layers")
+        add(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+
+        # Input and output embeddings
+        add(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        add(
+            "--share-encoder-input-output-embed",
+            action="store_true",
+            help="share encoder input and output embeddings",
+        )
+        add(
+            "--encoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the encoder",
+        )
+        add(
+            "--no-token-positional-embeddings",
+            action="store_true",
+            help="if set, disables positional embeddings (outside self attention)",
+        )
+        add("--num-segment", type=int, metavar="N", help="num segment in the input")
+        add(
+            "--max-positions", type=int, help="number of positional embeddings to learn"
+        )
+
+        # Sentence level prediction
+        add(
+            "--sentence-class-num",
+            type=int,
+            metavar="N",
+            help="number of classes for sentence task",
+        )
+        add(
+            "--sent-loss",
+            action="store_true",
+            help="if set, calculate sentence level predictions",
+        )
+
+        # Parameter initialization
+        add(
+            "--apply-bert-init",
+            action="store_true",
+            help="use custom param initialization for BERT",
+        )
+
+        # Activations and layer norm placement
+        add(
+            "--activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        add(
+            "--pooler-activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="Which activation function to use for pooler layer.",
+        )
+        add(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+
+    def forward(self, src_tokens, segment_labels=None, **kwargs):
+        """Delegate to the encoder and return its output."""
+        return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs)
+
+    def max_positions(self):
+        # ``max_positions`` is read from the encoder as an attribute, not called.
+        return self.encoder.max_positions
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a model instance from the parsed ``args`` and the ``task``."""
+        # Older models may lack newer options; fill those in first.
+        base_architecture(args)
+
+        # With no explicit limit, fall back to ``args.tokens_per_sample``.
+        if not safe_hasattr(args, "max_positions"):
+            args.max_positions = args.tokens_per_sample
+        logger.info(args)
+
+        return cls(args, MaskedLMEncoder(args, task.dictionary))
+
+
+class MaskedLMEncoder(FairseqEncoder):
+    """
+    Encoder for Masked Language Modelling: sentence encoder plus output heads.
+    """
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+
+        self.padding_idx = dictionary.pad()
+        self.vocab_size = len(dictionary)
+        # NOTE: stored on the instance, so this value shadows max_positions().
+        self.max_positions = args.max_positions
+
+        self.sentence_encoder = TransformerSentenceEncoder(
+            padding_idx=self.padding_idx,
+            vocab_size=self.vocab_size,
+            num_encoder_layers=args.encoder_layers,
+            embedding_dim=args.encoder_embed_dim,
+            ffn_embedding_dim=args.encoder_ffn_embed_dim,
+            num_attention_heads=args.encoder_attention_heads,
+            dropout=args.dropout,
+            attention_dropout=args.attention_dropout,
+            activation_dropout=args.act_dropout,
+            max_seq_len=self.max_positions,
+            num_segments=args.num_segment,
+            use_position_embeddings=not args.no_token_positional_embeddings,
+            encoder_normalize_before=args.encoder_normalize_before,
+            apply_bert_init=args.apply_bert_init,
+            activation_fn=args.activation_fn,
+            learned_pos_embedding=args.encoder_learned_pos,
+        )
+
+        self.share_input_output_embed = args.share_encoder_input_output_embed
+        self.embed_out = None
+        self.sentence_projection_layer = None
+        self.sentence_out_dim = args.sentence_class_num
+        self.lm_output_learned_bias = None
+
+        # Remove head is set to true during fine-tuning
+        self.load_softmax = not getattr(args, "remove_head", False)
+
+        self.masked_lm_pooler = nn.Linear(
+            args.encoder_embed_dim, args.encoder_embed_dim
+        )
+        self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn)
+
+        self.lm_head_transform_weight = nn.Linear(
+            args.encoder_embed_dim, args.encoder_embed_dim
+        )
+        self.activation_fn = utils.get_activation_fn(args.activation_fn)
+        self.layer_norm = LayerNorm(args.encoder_embed_dim)
+
+        if self.load_softmax:
+            self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))
+
+            if not self.share_input_output_embed:
+                self.embed_out = nn.Linear(
+                    args.encoder_embed_dim, self.vocab_size, bias=False
+                )
+
+            if args.sent_loss:
+                self.sentence_projection_layer = nn.Linear(
+                    args.encoder_embed_dim, self.sentence_out_dim, bias=False
+                )
+
+    def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused):
+        """
+        Forward pass for Masked LM encoder. Token, position (if specified) and
+        segment (if specified) embeddings are computed by the sentence encoder.
+
+        The sentence representation is assumed to be the output of the
+        classification_token (see bert_task or cross_lingual_lm task).
+        Args:
+            - src_tokens: B x T matrix representing sentences
+            - segment_labels: B x T matrix representing segment label for tokens
+            - masked_tokens: optional selector over the B x T positions; when
+              given, only the selected positions are projected to logits
+        Returns:
+            - a tuple of the following:
+                - logits for predictions in format B x T x C to be used in
+                  softmax afterwards
+                - a dictionary of additional data, where 'pooled_output' contains
+                  the representation for classification_token and 'inner_states'
+                  is a list of internal model states used to compute the
+                  predictions (similar in ELMO). 'sentence_logits' is the
+                  prediction logit for NSP task; it is None unless the sentence
+                  projection layer was built (sent-loss with the LM head kept).
+        """
+        inner_states, sentence_rep = self.sentence_encoder(
+            src_tokens,
+            segment_labels=segment_labels,
+        )
+
+        x = inner_states[-1].transpose(0, 1)
+        # project masked tokens only
+        if masked_tokens is not None:
+            x = x[masked_tokens, :]
+        x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x)))
+
+        pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep))
+
+        # project back to size of vocabulary
+        if self.share_input_output_embed and hasattr(
+            self.sentence_encoder.embed_tokens, "weight"
+        ):
+            x = F.linear(x, self.sentence_encoder.embed_tokens.weight)
+        elif self.embed_out is not None:
+            x = self.embed_out(x)
+        if self.lm_output_learned_bias is not None:
+            x = x + self.lm_output_learned_bias
+        sentence_logits = None
+        if self.sentence_projection_layer is not None:
+            sentence_logits = self.sentence_projection_layer(pooled_output)
+
+        return x, {
+            "inner_states": inner_states,
+            "pooled_output": pooled_output,
+            "sentence_logits": sentence_logits,
+        }
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.max_positions
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Adjust ``state_dict`` in place to fit this encoder and return it."""
+        if isinstance(
+            self.sentence_encoder.embed_positions, SinusoidalPositionalEmbedding
+        ):
+            state_dict[
+                name + ".sentence_encoder.embed_positions._float_tensor"
+            ] = torch.FloatTensor(1)
+        if not self.load_softmax:  # head removed: drop its saved weights
+            for k in list(state_dict.keys()):
+                if (
+                    "embed_out.weight" in k
+                    or "sentence_projection_layer.weight" in k
+                    or "lm_output_learned_bias" in k
+                ):
+                    del state_dict[k]
+        return state_dict
+
+
+@register_model_architecture("masked_lm", "masked_lm")
+def base_architecture(args):
+    """Assign the base ``masked_lm`` defaults to options absent from ``args``."""
+    defaults = (
+        ("dropout", 0.1),
+        ("attention_dropout", 0.1),
+        ("act_dropout", 0.0),
+        # encoder stack
+        ("encoder_ffn_embed_dim", 4096),
+        ("encoder_layers", 6),
+        ("encoder_attention_heads", 8),
+        # input and output embeddings
+        ("encoder_embed_dim", 1024),
+        ("share_encoder_input_output_embed", False),
+        ("encoder_learned_pos", False),
+        ("no_token_positional_embeddings", False),
+        ("num_segment", 2),
+        # sentence level prediction
+        ("sentence_class_num", 2),
+        ("sent_loss", False),
+        # initialization, activations and layer norm placement
+        ("apply_bert_init", False),
+        ("activation_fn", "relu"),
+        ("pooler_activation_fn", "tanh"),
+        ("encoder_normalize_before", False),
+    )
+    for option, fallback in defaults:
+        setattr(args, option, getattr(args, option, fallback))
+
+
+@register_model_architecture("masked_lm", "bert_base")
+def bert_base_architecture(args):
+    """BERT-base preset for ``masked_lm``.
+
+    Options already present on ``args`` are kept; whatever is still missing
+    after these presets is filled in by ``base_architecture``.
+    """
+    presets = (
+        ("encoder_embed_dim", 768),
+        ("share_encoder_input_output_embed", True),
+        ("no_token_positional_embeddings", False),
+        ("encoder_learned_pos", True),
+        ("num_segment", 2),
+        ("encoder_layers", 12),
+        ("encoder_attention_heads", 12),
+        ("encoder_ffn_embed_dim", 3072),
+        ("sentence_class_num", 2),
+        ("sent_loss", True),
+        ("apply_bert_init", True),
+        ("activation_fn", "gelu"),
+        ("pooler_activation_fn", "tanh"),
+        ("encoder_normalize_before", True),
+    )
+    for option, fallback in presets:
+        setattr(args, option, getattr(args, option, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture("masked_lm", "bert_large")
+def bert_large_architecture(args):
+    """Preset BERT-large sizes, then defer to ``bert_base_architecture``."""
+    names = ("embed_dim", "layers", "attention_heads", "ffn_embed_dim")
+    for name, size in zip(names, (1024, 24, 16, 4096)):
+        setattr(args, "encoder_" + name, getattr(args, "encoder_" + name, size))
+    bert_base_architecture(args)
+
+
+@register_model_architecture("masked_lm", "xlm_base")
+def xlm_architecture(args):
+    """XLM-base preset for ``masked_lm``.
+
+    Existing values on ``args`` win; ``base_architecture`` supplies the rest.
+    """
+    presets = (
+        ("encoder_embed_dim", 1024),
+        ("share_encoder_input_output_embed", True),
+        ("no_token_positional_embeddings", False),
+        ("encoder_learned_pos", True),
+        ("num_segment", 1),
+        ("encoder_layers", 6),
+        ("encoder_attention_heads", 8),
+        ("encoder_ffn_embed_dim", 4096),
+        ("sent_loss", False),
+        ("activation_fn", "gelu"),
+        ("encoder_normalize_before", False),
+        ("pooler_activation_fn", "tanh"),
+        ("apply_bert_init", True),
+    )
+    for option, fallback in presets:
+        setattr(args, option, getattr(args, option, fallback))
+    base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
new file mode 100644
index 0000000000..732d66b1d5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+from torch import Tensor
+
+
+@torch.jit.script
+def script_skip_tensor_list(x: List[Tensor], mask):
+    res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x]
+    outputs = []  # per input tensor: its masked selection, or the tensor itself
+    for i, t in enumerate(res):
+        if t.numel() != 0:
+            outputs.append(t)
+        else:
+            outputs.append(x[i])  # selection is empty: keep the original tensor
+    return outputs
+
+
+@torch.jit.script
+def script_skip_tensor(x: Tensor, mask):
+    # "None" case: a tensor that is empty along dim 0 is returned untouched
+    if x.size(0) == 0:
+        return x
+    res = x[mask] if x.size(0) == mask.size(0) else x[:, mask]  # dim 0, else dim 1
+    if res.numel() == 0:
+        return x  # nothing selected: fall back to the unmasked tensor
+    else:
+        return res
+
+
+@torch.jit.script
+def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):
+    """
+    Pad a 2D/3D tensor along dim=1 with padding_idx up to size trg_dim
+    """
+    if x is None:
+        return None
+
+    assert x.dim() == 2 or x.dim() == 3
+    assert trg_dim >= x.size(1), (trg_dim, x.size())  # shrinking is not supported
+    if trg_dim == x.size(1):
+        return x  # already at the target size
+
+    dims = [x.size(0), trg_dim - x.size(1)]  # shape of the padding block
+    if x.dim() == 3:
+        dims.append(x.size(2))
+    x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1)
+
+    return x
+
+
+@torch.jit.script
+def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor:
+    return x if x is not None else y  # x when it is set, otherwise y
+
+
+@torch.jit.script
+def fill_tensors(
+    x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int
+) -> Optional[Tensor]:
+    """
+    Fill x with y at masked positions (dim=0); x may be modified in place.
+    """
+    if x is None or x.size()[0] == 0 or y is None:
+        return x  # nothing to fill, or nothing to fill with
+    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
+    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))
+
+    n_selected = mask.sum()
+    if n_selected == 0:
+        return x
+    assert n_selected == y.size(0)
+    if n_selected == x.size(0):
+        return y  # every row is selected, so y replaces x entirely
+
+    if x.size(1) < y.size(1):
+        x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx)  # pad x up to y
+        x[mask] = y
+    elif x.size(1) > y.size(1):
+        x[mask] = torch.tensor(padding_idx).type_as(x)  # reset rows to padding
+        if x.dim() == 2:
+            x[mask, : y.size(1)] = y
+        else:
+            x[mask, : y.size(1), :] = y
+    else:
+        x[mask] = y
+    return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
new file mode 100644
index 0000000000..e722b647ed
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqMultiModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
+    Embedding,
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+    base_architecture,
+)
+from fairseq.utils import safe_hasattr
+
+
+@register_model("multilingual_transformer")
+class MultilingualTransformerModel(FairseqMultiModel):
+    """Train Transformer models for multiple language pairs simultaneously.
+
+    Requires `--task multilingual_translation`.
+
+    We inherit all arguments from TransformerModel and assume that all language
+    pairs use a single Transformer architecture. In addition, we provide several
+    options that are specific to the multilingual setting.
+
+    Args:
+        --share-encoder-embeddings: share encoder embeddings across all source languages
+        --share-decoder-embeddings: share decoder embeddings across all target languages
+        --share-encoders: share all encoder params (incl. embeddings) across all source languages
+        --share-decoders: share all decoder params (incl. embeddings) across all target languages
+    """
+
+    def __init__(self, encoders, decoders):
+        super().__init__(encoders, decoders)
+
+    @staticmethod
+    def add_args(parser):
+        """Add the TransformerModel arguments plus the four sharing switches."""
+        TransformerModel.add_args(parser)
+        parser.add_argument(
+            "--share-encoder-embeddings",
+            action="store_true",
+            help="share encoder embeddings across languages",
+        )
+        parser.add_argument(
+            "--share-decoder-embeddings",
+            action="store_true",
+            help="share decoder embeddings across languages",
+        )
+        parser.add_argument(
+            "--share-encoders",
+            action="store_true",
+            help="share encoders across languages",
+        )
+        parser.add_argument(
+            "--share-decoders",
+            action="store_true",
+            help="share decoders across languages",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build per-language-pair encoders/decoders, shared as configured."""
+        from fairseq.tasks.multilingual_translation import MultilingualTranslationTask
+
+        assert isinstance(task, MultilingualTranslationTask)
+
+        # make sure all arguments are present in older models
+        base_multilingual_architecture(args)
+
+        if not safe_hasattr(args, "max_source_positions"):
+            args.max_source_positions = 1024
+        if not safe_hasattr(args, "max_target_positions"):
+            args.max_target_positions = 1024
+
+        src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs]
+        tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs]
+
+        if args.share_encoders:
+            args.share_encoder_embeddings = True  # implied by sharing encoders
+        if args.share_decoders:
+            args.share_decoder_embeddings = True  # implied by sharing decoders
+
+        def build_embedding(dictionary, embed_dim, path=None):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            emb = Embedding(num_embeddings, embed_dim, padding_idx)
+            # if a path is provided, load the embeddings parsed from it
+            if path:
+                embed_dict = utils.parse_embedding(path)
+                utils.load_embedding(embed_dict, dictionary, emb)
+            return emb
+
+        # build shared embeddings (if applicable)
+        shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None
+        if args.share_all_embeddings:
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+            shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
+                dicts=task.dicts,
+                langs=task.langs,
+                embed_dim=args.encoder_embed_dim,
+                build_embedding=build_embedding,
+                pretrained_embed_path=args.encoder_embed_path,
+            )
+            shared_decoder_embed_tokens = shared_encoder_embed_tokens
+            args.share_decoder_input_output_embed = True
+        else:
+            if args.share_encoder_embeddings:
+                shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
+                    dicts=task.dicts,
+                    langs=src_langs,
+                    embed_dim=args.encoder_embed_dim,
+                    build_embedding=build_embedding,
+                    pretrained_embed_path=args.encoder_embed_path,
+                )
+            if args.share_decoder_embeddings:
+                shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
+                    dicts=task.dicts,
+                    langs=tgt_langs,
+                    embed_dim=args.decoder_embed_dim,
+                    build_embedding=build_embedding,
+                    pretrained_embed_path=args.decoder_embed_path,
+                )
+
+        # per-language caches of the encoders/decoders built so far
+        lang_encoders, lang_decoders = {}, {}
+
+        def get_encoder(lang):
+            if lang not in lang_encoders:
+                if shared_encoder_embed_tokens is not None:
+                    encoder_embed_tokens = shared_encoder_embed_tokens
+                else:
+                    encoder_embed_tokens = build_embedding(
+                        task.dicts[lang],
+                        args.encoder_embed_dim,
+                        args.encoder_embed_path,
+                    )
+                lang_encoders[lang] = cls._get_module_class(
+                    True, args, task.dicts[lang], encoder_embed_tokens, src_langs
+                )
+            return lang_encoders[lang]
+
+        def get_decoder(lang):
+            if lang not in lang_decoders:
+                if shared_decoder_embed_tokens is not None:
+                    decoder_embed_tokens = shared_decoder_embed_tokens
+                else:
+                    decoder_embed_tokens = build_embedding(
+                        task.dicts[lang],
+                        args.decoder_embed_dim,
+                        args.decoder_embed_path,
+                    )
+                lang_decoders[lang] = cls._get_module_class(
+                    False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs
+                )
+            return lang_decoders[lang]
+
+        # shared encoders/decoders (if applicable), built from the first language
+        shared_encoder, shared_decoder = None, None
+        if args.share_encoders:
+            shared_encoder = get_encoder(src_langs[0])
+        if args.share_decoders:
+            shared_decoder = get_decoder(tgt_langs[0])
+
+        encoders, decoders = OrderedDict(), OrderedDict()
+        for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs):
+            encoders[lang_pair] = (
+                shared_encoder if shared_encoder is not None else get_encoder(src)
+            )
+            decoders[lang_pair] = (
+                shared_decoder if shared_decoder is not None else get_decoder(tgt)
+            )
+
+        return MultilingualTransformerModel(encoders, decoders)  # NOTE: not cls(...)
+
+    @classmethod
+    def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs):
+        module_class = TransformerEncoder if is_encoder else TransformerDecoder
+        return module_class(args, lang_dict, embed_tokens)  # langs is unused here
+
+    def load_state_dict(self, state_dict, strict=True, model_cfg=None):
+        state_dict_subset = state_dict.copy()  # the input dict stays untouched
+        for k, _ in state_dict.items():
+            assert k.startswith("models.")
+            lang_pair = k.split(".")[1]
+            if lang_pair not in self.models:
+                del state_dict_subset[k]  # pair is not part of this model
+        super().load_state_dict(state_dict_subset, strict=strict, model_cfg=model_cfg)
+
+
+@register_model_architecture("multilingual_transformer", "multilingual_transformer")
+def base_multilingual_architecture(args):
+    """Apply the Transformer base defaults, then default the sharing switches."""
+    base_architecture(args)
+    # Every sharing option is off unless ``args`` already carries a value.
+    for flag in ("encoder_embeddings", "decoder_embeddings", "encoders", "decoders"):
+        setattr(args, "share_" + flag, getattr(args, "share_" + flag, False))
+
+
+@register_model_architecture(
+    "multilingual_transformer", "multilingual_transformer_iwslt_de_en"
+)
+def multilingual_transformer_iwslt_de_en(args):
+    """
+    Preset encoder and decoder sizes, then apply the base multilingual defaults.
+    """
+    sizes = {"embed_dim": 512, "ffn_embed_dim": 1024, "attention_heads": 4, "layers": 6}
+    for side in ("encoder", "decoder"):
+        for suffix, size in sizes.items():
+            option = side + "_" + suffix
+            setattr(args, option, getattr(args, option, size))
+    base_multilingual_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
new file mode 100644
index 0000000000..05fe822487
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+from .fairseq_nat_model import *
+from .nonautoregressive_transformer import *
+from .nat_crf_transformer import *
+from .iterative_nonautoregressive_transformer import *
+from .cmlm_transformer import *
+from .levenshtein_transformer import *
+from .insertion_transformer import *
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
new file mode 100644
index 0000000000..c876e9453c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
@@ -0,0 +1,162 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This file implements:
+Ghazvininejad, Marjan, et al.
+"Constant-time machine translation with conditional masked language models."
+arXiv preprint arXiv:1904.09324 (2019).
+"""
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import NATransformerModel
+from fairseq.utils import new_arange
+
+
+def _skeptical_unmasking(output_scores, output_masks, p):
+    # Flag, per row, the (row-sum of output_masks - 2) * p lowest-scoring slots
+    # (count truncated by .long()); assumes new_arange yields position indices.
+    kept = output_masks.sum(1, keepdim=True).type_as(output_scores)
+    ascending = output_scores.sort(-1)[1]
+    in_budget = new_arange(output_masks) < ((kept - 2) * p).long()
+    return in_budget.scatter(1, ascending, in_budget)
+
+
+@register_model("cmlm_transformer")
+class CMLMNATransformerModel(NATransformerModel):  # unk-valued positions act as the mask slots
+    @staticmethod
+    def add_args(parser):
+        NATransformerModel.add_args(parser)
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        assert not self.decoder.src_embedding_copy, "do not support embedding copy."
+
+        # encode the source once; both heads below read the same encoder_out
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        # length prediction (unnormalized outputs and their training target)
+        length_out = self.decoder.forward_length(
+            normalize=False, encoder_out=encoder_out
+        )
+        length_tgt = self.decoder.forward_length_prediction(
+            length_out, encoder_out, tgt_tokens
+        )
+
+        # decoding (unnormalized token outputs for every position)
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out,
+        )
+        word_ins_mask = prev_output_tokens.eq(self.unk)  # True where the decoder input is unk
+
+        return {
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": tgt_tokens,
+                "mask": word_ins_mask,
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+            },
+            "length": {
+                "out": length_out,
+                "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor,
+            },
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
+        # One refinement step: fill the unk slots, then re-mask the lowest-scoring tokens.
+        step = decoder_out.step
+        max_step = decoder_out.max_step
+
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder; .max(-1) keeps the best score and token per position
+        output_masks = output_tokens.eq(self.unk)
+        _scores, _tokens = self.decoder(
+            normalize=True,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out,
+        ).max(-1)
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])  # in place
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])  # in place
+
+        if history is not None:
+            history.append(output_tokens.clone())
+
+        # skeptical decoding: re-mask on every step except the last (step + 1 == max_step)
+        if (step + 1) < max_step:
+            skeptical_mask = _skeptical_unmasking(
+                output_scores, output_tokens.ne(self.pad), 1 - (step + 1) / max_step
+            )
+
+            output_tokens.masked_fill_(skeptical_mask, self.unk)
+            output_scores.masked_fill_(skeptical_mask, 0.0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history,
+        )
+
+
+@register_model_architecture("cmlm_transformer", "cmlm_transformer")
+def cmlm_base_architecture(args):  # every default applies only when args lacks the attribute
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", True)  # on by default here
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments (length prediction and embedding-copy switches) ---
+    args.sg_length_pred = getattr(args, "sg_length_pred", False)
+    args.pred_length_offset = getattr(args, "pred_length_offset", False)
+    args.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+    args.ngram_predictor = getattr(args, "ngram_predictor", 1)
+    args.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+
+
+@register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de")
+def cmlm_wmt_en_de(args):  # registered under a second name with the same defaults
+    cmlm_base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
new file mode 100644
index 0000000000..a5594a4ed9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+from fairseq.models.transformer import (
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+def ensemble_encoder(func):  # decorator: run func on every model in self.ensemble_models
+    def wrapper(self, *args, **kwargs):
+        if self.ensemble_models is None or len(self.ensemble_models) == 1:
+            return func(self, *args, **kwargs)  # no ensemble: plain single-model call
+        encoder_outs = [
+            func(model, *args, **kwargs, return_all_hiddens=True)
+            for model in self.ensemble_models
+        ]
+        _encoder_out = encoder_outs[0].copy()
+        # the copy of the first model's output is the template; stacked fields replace its own
+        def stack(key):
+            outs = [e[key][0] for e in encoder_outs]
+            return [torch.stack(outs, -1) if outs[0] is not None else None]
+
+        _encoder_out["encoder_out"] = stack("encoder_out")
+        _encoder_out["encoder_embedding"] = stack("encoder_embedding")
+        # per-layer states: stack layer i of every model along a new trailing dimension
+        num_layers = len(_encoder_out["encoder_states"])
+        if num_layers > 0:
+            _encoder_out["encoder_states"] = [
+                torch.stack([e["encoder_states"][i] for e in encoder_outs], -1)
+                for i in range(num_layers)
+            ]
+        return _encoder_out
+
+    return wrapper
+
+
+def ensemble_decoder(func):  # decorator: run func per ensemble member and merge the results
+    def wrapper(self, normalize=False, encoder_out=None, *args, **kwargs):
+        if self.ensemble_models is None or len(self.ensemble_models) == 1:
+            return func(
+                self, normalize=normalize, encoder_out=encoder_out, *args, **kwargs
+            )
+        # helper: copy of encoder_out whose "encoder_out" entry holds one model's slice
+        def _replace(encoder_out, new_val):
+            new_encoder_out = encoder_out.copy()
+            new_encoder_out["encoder_out"] = [new_val]
+            return new_encoder_out
+        # model i reads index i of the last (4th) dimension of the stacked encoder output
+        action_outs = [
+            func(
+                model,
+                normalize=normalize,
+                encoder_out=_replace(
+                    encoder_out, encoder_out["encoder_out"][0][:, :, :, i]
+                ),
+                *args,
+                **kwargs
+            )
+            for i, model in enumerate(self.ensemble_models)
+        ]
+        # wrap single results in a list so both return shapes share the loop below
+        if not isinstance(action_outs[0], tuple):  # single return value
+            action_outs = [[a] for a in action_outs]
+        else:
+            action_outs = [list(a) for a in action_outs]
+        # first output under normalize: logsumexp - log(N) over models; others are stacked
+        ensembled_outs = []
+        for i in range(len(action_outs[0])):
+            if i == 0 and normalize:
+                ensembled_outs += [
+                    torch.logsumexp(
+                        torch.stack([a[i] for a in action_outs], -1), dim=-1
+                    )
+                    - math.log(len(self.ensemble_models))
+                ]
+            elif action_outs[0][i] is not None:
+                ensembled_outs += [torch.stack([a[i] for a in action_outs], -1)]
+            else:
+                ensembled_outs += [None]
+        # mirror the wrapped function's shape: bare value for one output, tuple otherwise
+        if len(ensembled_outs) == 1:
+            return ensembled_outs[0]
+        return tuple(ensembled_outs)
+
+    return wrapper
+
+
+class FairseqNATModel(TransformerModel):
+    """
+    Abstract base for nonautoregressive models; subclasses implement forward*.
+    """
+
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+        self.tgt_dict = decoder.dictionary
+        self.bos = decoder.dictionary.bos()
+        self.eos = decoder.dictionary.eos()
+        self.pad = decoder.dictionary.pad()
+        self.unk = decoder.dictionary.unk()
+        # special-symbol ids above are cached from the decoder (target) dictionary
+        self.ensemble_models = None
+
+    @property
+    def allow_length_beam(self):
+        return False
+
+    @property
+    def allow_ensemble(self):
+        return True
+
+    def enable_ensemble(self, models):  # give encoder/decoder their peers from every model
+        self.encoder.ensemble_models = [m.encoder for m in models]
+        self.decoder.ensemble_models = [m.decoder for m in models]
+
+    @staticmethod
+    def add_args(parser):
+        TransformerModel.add_args(parser)
+        parser.add_argument(
+            "--apply-bert-init",
+            action="store_true",
+            help="use custom param initialization for BERT",
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        decoder = FairseqNATDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        encoder = FairseqNATEncoder(args, src_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            encoder.apply(init_bert_params)
+        return encoder
+
+    def forward_encoder(self, encoder_inputs):  # encoder_inputs is unpacked positionally
+        return self.encoder(*encoder_inputs)
+
+    def forward_decoder(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: raise rather than return the exception class
+
+    def initialize_output_tokens(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: subclasses must override
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError  # abstract: subclasses must override
+
+
+class FairseqNATEncoder(TransformerEncoder):  # TransformerEncoder with ensemble-aware forward
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(args, dictionary, embed_tokens)
+        self.ensemble_models = None  # populated by FairseqNATModel.enable_ensemble
+
+    @ensemble_encoder
+    def forward(self, *args, **kwargs):  # the decorator fans out when an ensemble is set
+        return super().forward(*args, **kwargs)
+
+
+class FairseqNATDecoder(TransformerDecoder):  # adds the attribute ensemble_decoder reads
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
+        self.ensemble_models = None  # populated by FairseqNATModel.enable_ensemble
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
new file mode 100644
index 0000000000..bc28000f59
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
@@ -0,0 +1,280 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import (
+    FairseqNATModel,
+    LevenshteinTransformerDecoder,
+    LevenshteinTransformerModel,
+    ensemble_decoder,
+)
+from fairseq.models.transformer import Linear
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.utils import new_arange
+
+
+class NegativeDistanceScore(object):  # callable: weight of item i among L, sharpness tau
+    def __init__(self):
+        # tables are cached for three tau values, each built with L = 50
+        # pre-compute some values
+        self.scores = {}
+
+        self.scores[0.5] = self.compute_score_full(50, 0.5)
+        self.scores[1.0] = self.compute_score_full(50, 1.0)
+        self.scores[2.0] = self.compute_score_full(50, 2.0)
+
+    def __call__(self, i, L, tau):
+        if (tau is None) or (tau > 1000):
+            return 1 / L  # no / very large tau: uniform weight over the L items
+        # use the cached table when it has a row for this L, otherwise compute directly
+        if tau in self.scores:
+            if L < self.scores[tau].shape[0]:
+                return self.scores[tau][L - 1, i]
+        return self.compute_score(L, tau)[i]
+
+    def compute_score(self, L, tau):  # softmax over i of -|L / 2 - i| / tau
+        s = np.array([-abs(L / 2 - i) / tau for i in range(L)])
+        s = np.exp(s - s.max())
+        return s / s.sum()
+
+    def compute_score_full(self, L, tau):  # (L - 1, L) table; row r keeps columns 0..r
+        s = -abs(np.arange(0, L - 1)[:, None] / 2 - np.arange(L)[None, :]) / tau  # NOTE(review): row r is centred at r / 2, compute_score at L / 2 - confirm
+        s = np.tril(s, 0) + np.triu(s - float("inf"), 1)  # above-diagonal entries -> -inf
+        s = np.exp(s - s.max(1, keepdims=True))
+        return s / s.sum(1, keepdims=True)
+
+
+neg_scorer = NegativeDistanceScore()  # module-level instance; its tables are built at import
+
+
+def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None):  # NOTE(review): unk_idx is never read
+    try:
+        from fairseq import libnat
+    except ImportError as e:
+        import sys
+
+        sys.stderr.write("ERROR: missing libnat. run `pip install --editable .`\n")
+        raise e
+    # sizes used to lay out the flat label buffer below
+    B = in_tokens.size(0)
+    T = in_tokens.size(1)
+    V = vocab_size
+    # libnat receives plain Python lists with padding stripped
+    with torch.cuda.device_of(in_tokens):
+        in_tokens_list = [
+            [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist())
+        ]
+        out_tokens_list = [
+            [t for t in s if t != padding_idx]
+            for i, s in enumerate(out_tokens.tolist())
+        ]
+
+    full_labels = libnat.suggested_ed2_path(
+        in_tokens_list, out_tokens_list, padding_idx
+    )
+    insert_labels = [a[:-1] for a in full_labels]  # drop the last entry of every path
+    # NOTE(review): if no label is produced at all, the zip(*[...]) unpack below raises ValueError
+    # numericalize: flat index w + (j + i * (T - 1)) * V addresses [i, j, w] after the view
+    insert_label_tensors = in_tokens.new_zeros(B * (T - 1) * V).float()
+    insert_index, insert_labels = zip(
+        *[
+            (w + (j + i * (T - 1)) * V, neg_scorer(k, len(label), tau))
+            for i, labels in enumerate(insert_labels)
+            for j, label in enumerate(labels[1:-1])
+            for k, w in enumerate(label)
+        ]
+    )  # HACK 1:-1 (the first and last label slots are skipped)
+    insert_index, insert_labels = [
+        torch.tensor(list(a), device=in_tokens.device)
+        for a in [insert_index, insert_labels]
+    ]
+    insert_label_tensors.scatter_(0, insert_index.long(), insert_labels)
+    insert_label_tensors = insert_label_tensors.view(B, T - 1, V)
+
+    return insert_label_tensors
+
+
+def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, padding_idx):
+    # Merge predicted tokens into the gaps between input tokens by sorting coordinates.
+    padding_masks = in_tokens[:, 1:].eq(padding_idx)
+    word_ins_scores.masked_fill_(padding_masks, 0.0)  # in place: mutates the caller's tensor
+    word_ins_pred.masked_fill_(padding_masks, padding_idx)  # in place as well
+    # assumes new_arange yields per-row position indices - TODO confirm
+    in_coords = new_arange(in_tokens).type_as(in_scores)
+
+    # each prediction sits 0.5 before the next input token; pad predictions go to +inf (last)
+    out_coords = (in_coords[:, 1:] - 0.5).masked_fill(
+        word_ins_pred.eq(padding_idx), float("inf")
+    )
+    out_coords = torch.cat([in_coords, out_coords], 1).sort(-1)[1]
+    out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, out_coords)
+    out_scores = torch.cat([in_scores, word_ins_scores], 1).gather(1, out_coords)
+    return out_tokens, out_scores
+
+
+@register_model("insertion_transformer")
+class InsertionTransformerModel(LevenshteinTransformerModel):
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        FairseqNATModel.add_args(parser)  # called directly, bypassing the parent class's add_args
+        parser.add_argument("--label-tau", default=None, type=float)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # Training pass: insertion outputs per slot plus targets from _get_ins_targets.
+        assert tgt_tokens is not None, "forward function only supports training."
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # unnormalized insertion outputs for every slot
+        word_ins_out = self.decoder.forward_word_ins(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out,
+        )
+        # generate training labels for insertion
+        word_ins_tgt = _get_ins_targets(
+            prev_output_tokens,
+            tgt_tokens,
+            self.pad,
+            self.unk,
+            len(self.tgt_dict),
+            tau=self.decoder.label_tau,
+        ).type_as(word_ins_out)
+        word_ins_masks = prev_output_tokens[:, 1:].ne(self.pad)  # True where the right-hand token is not pad
+
+        return {
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": word_ins_tgt,
+                "mask": word_ins_masks,
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+            }
+        }
+
+    def forward_decoder(
+        self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs
+    ):
+        # One decoding step: take the best token per slot and merge it into the output.
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # TODO: decoding for InsertionTransformer
+        word_ins_score = self.decoder.forward_word_ins(
+            normalize=True, prev_output_tokens=output_tokens, encoder_out=encoder_out
+        )
+
+        if eos_penalty > 0.0:
+            word_ins_score[:, :, self.pad] -= eos_penalty  # the penalty lands on the pad score
+        word_ins_score, word_ins_pred = word_ins_score.max(-1)
+        output_tokens, output_scores = _apply_ins_words(
+            output_tokens, output_scores, word_ins_pred, word_ins_score, self.pad
+        )
+
+        # delete some unnecessary paddings (trim to the longest non-pad row)
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+
+        if history is not None:
+            history.append(output_tokens.clone())
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history,
+        )
+
+
+class InsertionTransformerDecoder(LevenshteinTransformerDecoder):
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        # skip LevenshteinTransformerDecoder.__init__; start from the next class in the MRO
+        super(LevenshteinTransformerDecoder, self).__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+        self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim)
+        # pool_out maps a concatenated pair of features back to output_embed_dim
+        self.label_tau = getattr(args, "label_tau", None)
+
+    @ensemble_decoder
+    def forward_word_ins(self, normalize, encoder_out, prev_output_tokens):
+        features = self.extract_features(prev_output_tokens, encoder_out=encoder_out)[0]
+        features = self.pool_out(
+            torch.cat([features[:, :-1, :], features[:, 1:, :]], 2)  # each position + its right neighbour
+        )
+        decoder_out = self.output_layer(features)
+        return F.log_softmax(decoder_out, -1) if normalize else decoder_out
+
+    def forward_mask_ins(self, *args, **kwargs):
+        raise NotImplementedError  # this decoder only implements forward_word_ins
+
+    def forward_word_del(self, *args, **kwargs):
+        raise NotImplementedError  # this decoder only implements forward_word_ins
+
+
+@register_model_architecture("insertion_transformer", "insertion_transformer")
+def insertion_base_architecture(args):  # every default applies only when args lacks the attribute
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # special for insertion transformer (read by InsertionTransformerDecoder.__init__)
+    args.label_tau = getattr(args, "label_tau", None)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
new file mode 100644
index 0000000000..bc39509980
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
@@ -0,0 +1,228 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import NATransformerModel
+
+
+def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1):
+    # s: batch of token rows, corrupted in place and returned; V: vocabulary size
+    # beta: chance a token is corrupted, split evenly over replace / repeat / swap
+    rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device)
+    choices = torch.rand(size=s.size(), device=s.device)
+    choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1)
+    # pad/bos/eos now hold 1, so for beta <= 1 they always land in `safe`
+    replace = choices < beta / 3
+    repeat = (choices >= beta / 3) & (choices < beta * 2 / 3)
+    swap = (choices >= beta * 2 / 3) & (choices < beta)
+    safe = choices >= beta
+    # left-to-right sweep; every edit touches only positions i and i + 1
+    for i in range(s.size(1) - 1):
+        rand_word = rand_words[:, i]
+        next_word = s[:, i + 1]
+        self_word = s[:, i].clone()  # snapshot: the view would change once s[:, i] is written
+        # compare against the caller's eos id (not a literal) so eos is never moved or overwritten
+        replace_i = replace[:, i]
+        swap_i = swap[:, i] & (next_word != eos)
+        repeat_i = repeat[:, i] & (next_word != eos)
+        safe_i = safe[:, i] | ((next_word == eos) & (~replace_i))
+        # the masks are mutually exclusive per row, so the sums below act as a select
+        s[:, i] = (
+            self_word * (safe_i | repeat_i).long()
+            + next_word * swap_i.long()
+            + rand_word * replace_i.long()
+        )
+        s[:, i + 1] = (
+            next_word * (safe_i | replace_i).long()
+            + self_word * (swap_i | repeat_i).long()
+        )
+    return s
+
+
+def gumbel_noise(input, TINY=1e-8):
+    """Return noise ``-log(-log(u + TINY) + TINY)`` shaped like ``input``.
+
+    ``u`` is drawn with ``uniform_()`` into a fresh buffer created by
+    ``new_zeros``; ``TINY`` offsets each logarithm's argument away from zero.
+    """
+    uniform = input.new_zeros(*input.size()).uniform_()
+    # inner transform, in place on the fresh buffer: -log(u + TINY)
+    inner = uniform.add_(TINY).log_().neg_()
+    # outer transform, same buffer: -log(inner + TINY)
+    return inner.add_(TINY).log_().neg_()
+
+
+@register_model("iterative_nonautoregressive_transformer")
+class IterNATransformerModel(NATransformerModel):
+    @staticmethod
+    def add_args(parser):
+        NATransformerModel.add_args(parser)
+        parser.add_argument(
+            "--train-step",
+            type=int,
+            help="number of refinement iterations during training",
+        )
+        parser.add_argument(
+            "--dae-ratio",
+            type=float,
+            help="the probability of switching to the denoising auto-encoder loss",
+        )
+        parser.add_argument(
+            "--stochastic-approx",
+            action="store_true",
+            help="sampling from the decoder as the inputs for next iteration",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        model = super().build_model(args, task)
+        model.train_step = getattr(args, "train_step", 4)  # copied onto the model for forward()
+        model.dae_ratio = getattr(args, "dae_ratio", 0.5)
+        model.stochastic_approx = getattr(args, "stochastic_approx", False)
+        return model
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # Unrolls train_step decoder passes and returns the outputs of all of them.
+        B, T = prev_output_tokens.size()
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction
+        length_out = self.decoder.forward_length(
+            normalize=False, encoder_out=encoder_out
+        )
+        length_tgt = self.decoder.forward_length_prediction(
+            length_out, encoder_out, tgt_tokens
+        )
+
+        # decoding: one entry per pass is appended to each of these lists
+        word_ins_outs, word_ins_tgts, word_ins_masks = [], [], []
+        for t in range(self.train_step):
+            word_ins_out = self.decoder(
+                normalize=False,
+                prev_output_tokens=prev_output_tokens,
+                encoder_out=encoder_out,
+                step=t,
+            )
+            word_ins_tgt = tgt_tokens
+            word_ins_mask = word_ins_tgt.ne(self.pad)  # every non-pad target position
+
+            word_ins_outs.append(word_ins_out)
+            word_ins_tgts.append(word_ins_tgt)
+            word_ins_masks.append(word_ins_mask)
+
+            if t < (self.train_step - 1):
+                # prediction for next iteration (argmax, optionally after adding Gumbel noise)
+                if self.stochastic_approx:
+                    word_ins_prediction = (
+                        word_ins_out + gumbel_noise(word_ins_out)
+                    ).max(-1)[1]
+                else:
+                    word_ins_prediction = word_ins_out.max(-1)[1]
+                # out-of-place masked_scatter: the caller's prev_output_tokens is not modified
+                prev_output_tokens = prev_output_tokens.masked_scatter(
+                    word_ins_mask, word_ins_prediction[word_ins_mask]
+                )
+
+                if self.dae_ratio > 0:
+                    # we do not perform denoising for the first iteration
+                    corrputed = (
+                        torch.rand(size=(B,), device=prev_output_tokens.device)
+                        < self.dae_ratio
+                    )
+                    corrputed_tokens = _sequential_poisoning(
+                        tgt_tokens[corrputed],  # boolean indexing copies, so tgt_tokens stays intact
+                        len(self.tgt_dict),
+                        0.33,
+                        self.bos,
+                        self.eos,
+                        self.pad,
+                    )
+                    prev_output_tokens[corrputed] = corrputed_tokens
+
+        # concat everything (all passes, along dim 0)
+        word_ins_out = torch.cat(word_ins_outs, 0)
+        word_ins_tgt = torch.cat(word_ins_tgts, 0)
+        word_ins_mask = torch.cat(word_ins_masks, 0)
+
+        return {
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": word_ins_tgt,
+                "mask": word_ins_mask,
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+            },
+            "length": {
+                "out": length_out,
+                "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor,
+            },
+        }
+
+
+@register_model_architecture(
+    "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer"
+)
+def inat_base_architecture(args):  # every default applies only when args lacks the attribute
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments (length prediction and embedding-copy switches) ---
+    args.sg_length_pred = getattr(args, "sg_length_pred", False)
+    args.pred_length_offset = getattr(args, "pred_length_offset", False)
+    args.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+    args.ngram_predictor = getattr(args, "ngram_predictor", 1)
+    args.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+    # iterative-refinement options; same fallbacks as IterNATransformerModel.build_model
+    args.train_step = getattr(args, "train_step", 4)
+    args.dae_ratio = getattr(args, "dae_ratio", 0.5)
+    args.stochastic_approx = getattr(args, "stochastic_approx", False)
+
+
+@register_model_architecture(
+    "iterative_nonautoregressive_transformer",
+    "iterative_nonautoregressive_transformer_wmt_en_de",
+)
+def iter_nat_wmt_en_de(args):  # registered under a second name with the same defaults
+    inat_base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
new file mode 100644
index 0000000000..d60d3c52d5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
@@ -0,0 +1,510 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.iterative_refinement_generator import DecoderOut
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder
+from fairseq.models.transformer import Embedding
+from fairseq.modules import TransformerDecoderLayer
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+from .levenshtein_utils import (
+    _apply_del_words,
+    _apply_ins_masks,
+    _apply_ins_words,
+    _fill,
+    _get_del_targets,
+    _get_ins_targets,
+    _skip,
+    _skip_encoder_out,
+)
+
+
+@register_model("levenshtein_transformer")
+class LevenshteinTransformerModel(FairseqNATModel):
+    @property
+    def allow_length_beam(self):
+        return False  # hard-wired to False for this model
+
+    @staticmethod
+    def add_args(parser):  # adds the base NAT options, then the model-specific flags below
+        FairseqNATModel.add_args(parser)
+        parser.add_argument(
+            "--early-exit",
+            default="6,6,6",  # the decoder parses this as three comma-separated ints
+            type=str,
+            help="number of decoder layers before word_del, mask_ins, word_ins",
+        )
+        parser.add_argument(
+            "--no-share-discriminator",
+            action="store_true",
+            help="separate parameters for discriminator",
+        )
+        parser.add_argument(
+            "--no-share-maskpredictor",
+            action="store_true",
+            help="separate parameters for mask-predictor",
+        )
+        parser.add_argument(
+            "--share-discriminator-maskpredictor",
+            action="store_true",
+            help="share the parameters for both mask-predictor and discriminator",
+        )
+        parser.add_argument(
+            "--sampling-for-deletion",
+            action="store_true",
+            help="instead of argmax, use sampling to predict the tokens",
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):  # returns the Levenshtein decoder
+        decoder = LevenshteinTransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):  # flag defaults to False when absent
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        """Training pass: return outputs, targets and masks for the three edit heads."""
+        assert tgt_tokens is not None, "forward function only supports training."
+
+        # encoding
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # generate training labels for insertion
+        masked_tgt_masks, masked_tgt_tokens, mask_ins_targets = _get_ins_targets(
+            prev_output_tokens, tgt_tokens, self.pad, self.unk
+        )
+        mask_ins_targets = mask_ins_targets.clamp(min=0, max=255)  # for safe prediction
+        mask_ins_masks = prev_output_tokens[:, 1:].ne(self.pad)
+
+        mask_ins_out, _ = self.decoder.forward_mask_ins(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out,
+        )
+        word_ins_out, _ = self.decoder.forward_word_ins(
+            normalize=False,
+            prev_output_tokens=masked_tgt_tokens,
+            encoder_out=encoder_out,
+        )
+
+        # make online prediction: sample from, or take the argmax of, the word_ins output
+        if self.decoder.sampling_for_deletion:
+            word_predictions = torch.multinomial(
+                F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1
+            ).view(word_ins_out.size(0), -1)
+        else:
+            word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1]
+        # positions that were not masked are overwritten with the reference tokens
+        word_predictions.masked_scatter_(
+            ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks]
+        )
+
+        # generate training labels for deletion (the predictions above are the input)
+        word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad)
+        word_del_out, _ = self.decoder.forward_word_del(
+            normalize=False,
+            prev_output_tokens=word_predictions,
+            encoder_out=encoder_out,
+        )
+        word_del_masks = word_predictions.ne(self.pad)
+
+        return {
+            "mask_ins": {
+                "out": mask_ins_out,
+                "tgt": mask_ins_targets,
+                "mask": mask_ins_masks,
+                "ls": 0.01,  # fixed value here; word_ins uses args.label_smoothing
+            },
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": tgt_tokens,
+                "mask": masked_tgt_masks,
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+            },
+            "word_del": {
+                "out": word_del_out,
+                "tgt": word_del_targets,
+                "mask": word_del_masks,
+            },
+        }
+
+    def forward_decoder(
+        self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs
+    ):
+
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        attn = decoder_out.attn
+        history = decoder_out.history
+
+        bsz = output_tokens.size(0)
+        if max_ratio is None:
+            max_lens = torch.zeros_like(output_tokens).fill_(255)
+        else:
+            if not encoder_out["encoder_padding_mask"]:
+                max_src_len = encoder_out["encoder_out"].size(0)
+                src_lens = encoder_out["encoder_out"].new(bsz).fill_(max_src_len)
+            else:
+                src_lens = (~encoder_out["encoder_padding_mask"][0]).sum(1)
+            max_lens = (src_lens * max_ratio).clamp(min=10).long()
+
+        # delete words
+        # do not delete tokens if it is <s> </s>
+        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
+        if can_del_word.sum() != 0:  # we cannot delete, skip
+            word_del_score, word_del_attn = self.decoder.forward_word_del(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_del_word),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_del_word),
+            )
+            word_del_pred = word_del_score.max(-1)[1].bool()
+
+            _tokens, _scores, _attn = _apply_del_words(
+                output_tokens[can_del_word],
+                output_scores[can_del_word],
+                word_del_attn,
+                word_del_pred,
+                self.pad,
+                self.bos,
+                self.eos,
+            )
+            output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_del_word, _scores, 0)
+            attn = _fill(attn, can_del_word, _attn, 0.0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # insert placeholders
+        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens
+        if can_ins_mask.sum() != 0:
+            mask_ins_score, _ = self.decoder.forward_mask_ins(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_ins_mask),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_mask),
+            )
+            if eos_penalty > 0.0:
+                mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty
+            mask_ins_pred = mask_ins_score.max(-1)[1]
+            mask_ins_pred = torch.min(
+                mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred)
+            )
+
+            _tokens, _scores = _apply_ins_masks(
+                output_tokens[can_ins_mask],
+                output_scores[can_ins_mask],
+                mask_ins_pred,
+                self.pad,
+                self.unk,
+                self.eos,
+            )
+            output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # insert words
+        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0
+        if can_ins_word.sum() != 0:
+            word_ins_score, word_ins_attn = self.decoder.forward_word_ins(
+                normalize=True,
+                prev_output_tokens=_skip(output_tokens, can_ins_word),
+                encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_word),
+            )
+            word_ins_score, word_ins_pred = word_ins_score.max(-1)
+            _tokens, _scores = _apply_ins_words(
+                output_tokens[can_ins_word],
+                output_scores[can_ins_word],
+                word_ins_pred,
+                word_ins_score,
+                self.unk,
+            )
+
+            output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
+            output_scores = _fill(output_scores, can_ins_word, _scores, 0)
+            attn = _fill(attn, can_ins_word, word_ins_attn, 0.0)
+
+            if history is not None:
+                history.append(output_tokens.clone())
+
+        # delete some unnecessary paddings
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+        attn = None if attn is None else attn[:, :cut_off, :]
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=attn,
+            history=history,
+        )
+
+    def initialize_output_tokens(self, encoder_out, src_tokens):  # initial [bos, eos] rows
+        initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2)
+        initial_output_tokens[:, 0] = self.bos
+        initial_output_tokens[:, 1] = self.eos
+        # scores start at zero and are cast to the type of the first encoder output
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(encoder_out["encoder_out"][0])
+
+        return DecoderOut(
+            output_tokens=initial_output_tokens,
+            output_scores=initial_output_scores,
+            attn=None,
+            step=0,
+            max_step=0,
+            history=None,
+        )
+
+
+class LevenshteinTransformerDecoder(FairseqNATDecoder):
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+        self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False)
+        self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None)  # mask_ins head
+        self.embed_word_del = Embedding(2, self.output_embed_dim, None)  # word_del head
+
+        # layer counts in the order word_del, mask_ins, word_ins (indices 0, 1, 2)
+        self.early_exit = [int(i) for i in args.early_exit.split(",")]
+        assert len(self.early_exit) == 3
+
+        # optional separate layer stacks; None makes extract_features use self.layers
+        self.layers_msk = None
+        if getattr(args, "no_share_maskpredictor", False):
+            self.layers_msk = nn.ModuleList(
+                [
+                    TransformerDecoderLayer(args, no_encoder_attn)
+                    for _ in range(self.early_exit[1])
+                ]
+            )
+        self.layers_del = None
+        if getattr(args, "no_share_discriminator", False):
+            self.layers_del = nn.ModuleList(
+                [
+                    TransformerDecoderLayer(args, no_encoder_attn)
+                    for _ in range(self.early_exit[0])
+                ]
+            )
+        # sharing re-points layers_msk at layers_del, replacing any stack built above
+        if getattr(args, "share_discriminator_maskpredictor", False):
+            assert getattr(
+                args, "no_share_discriminator", False
+            ), "must set saperate discriminator"
+            self.layers_msk = self.layers_del
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        early_exit=None,
+        layers=None,
+        **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+        Inputs:
+            prev_output_tokens: Tensor(B, T)
+            encoder_out: a dictionary of hidden states and masks
+            early_exit / layers: run only the first `early_exit` layers of `layers`
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dict holding the last layer's "attn" and the list "inner_states"
+            the LevenshteinTransformer decoder has full-attention to all generated tokens
+        """
+        # embed positions
+        positions = (
+            self.embed_positions(prev_output_tokens)
+            if self.embed_positions is not None
+            else None
+        )
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+        inner_states = [x]
+
+        # decoder layers; both defaults resolve to the full shared stack self.layers
+        decoder_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        layers = self.layers if layers is None else layers
+        early_exit = len(layers) if early_exit is None else early_exit
+        for _, layer in enumerate(layers[:early_exit]):
+            x, attn, _ = layer(
+                x,
+                encoder_out["encoder_out"][0]
+                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
+                else None,
+                encoder_out["encoder_padding_mask"][0]
+                if (
+                    encoder_out is not None
+                    and len(encoder_out["encoder_padding_mask"]) > 0
+                )
+                else None,
+                self_attn_mask=None,  # no self-attention mask is passed
+                self_attn_padding_mask=decoder_padding_mask,
+            )
+            inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    @ensemble_decoder
+    def forward_mask_ins(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            early_exit=self.early_exit[1],
+            layers=self.layers_msk,
+            **unused
+        )
+        features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2)  # adjacent pairs
+        decoder_out = F.linear(features_cat, self.embed_mask_ins.weight)  # T-1 positions out
+        if normalize:  # log-probabilities instead of raw scores
+            return F.log_softmax(decoder_out, -1), extra["attn"]
+        return decoder_out, extra["attn"]
+
+    @ensemble_decoder
+    def forward_word_ins(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            early_exit=self.early_exit[2],
+            layers=self.layers,  # word insertion always uses the main layer stack
+            **unused
+        )
+        decoder_out = self.output_layer(features)
+        if normalize:  # log-probabilities instead of raw scores
+            return F.log_softmax(decoder_out, -1), extra["attn"]
+        return decoder_out, extra["attn"]
+
+    @ensemble_decoder
+    def forward_word_del(self, normalize, encoder_out, prev_output_tokens, **unused):
+        features, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            early_exit=self.early_exit[0],
+            layers=self.layers_del,  # None falls back to self.layers in extract_features
+            **unused
+        )
+        decoder_out = F.linear(features, self.embed_word_del.weight)  # two classes per token
+        if normalize:  # log-probabilities instead of raw scores
+            return F.log_softmax(decoder_out, -1), extra["attn"]
+        return decoder_out, extra["attn"]
+
+
+@register_model_architecture("levenshtein_transformer", "levenshtein_transformer")
+def levenshtein_base_architecture(args):  # fills in a default for every attribute not yet set
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+    # decoder input/output dims default to the decoder embedding dim
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.sampling_for_deletion = getattr(args, "sampling_for_deletion", False)
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.early_exit = getattr(args, "early_exit", "6,6,6")
+    args.no_share_discriminator = getattr(args, "no_share_discriminator", False)
+    args.no_share_maskpredictor = getattr(args, "no_share_maskpredictor", False)
+    args.share_discriminator_maskpredictor = getattr(
+        args, "share_discriminator_maskpredictor", False
+    )
+    args.no_share_last_layer = getattr(args, "no_share_last_layer", False)  # NOTE(review): not read in this file
+
+
+@register_model_architecture(
+    "levenshtein_transformer", "levenshtein_transformer_wmt_en_de"
+)
+def levenshtein_transformer_wmt_en_de(args):  # alias: base defaults with no overrides
+    levenshtein_base_architecture(args)
+
+
+# similar parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
+@register_model_architecture(
+    "levenshtein_transformer", "levenshtein_transformer_vaswani_wmt_en_de_big"
+)
+def levenshtein_transformer_vaswani_wmt_en_de_big(args):  # overrides first, then base defaults
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.dropout = getattr(args, "dropout", 0.3)
+    levenshtein_base_architecture(args)  # fills every attribute still unset
+
+
+# default parameters used in tensor2tensor implementation
+@register_model_architecture(
+    "levenshtein_transformer", "levenshtein_transformer_wmt_en_de_big"
+)
+def levenshtein_transformer_wmt_en_de_big_t2t(args):  # pre-norm and dropout overrides on the big preset
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.1)
+    levenshtein_transformer_vaswani_wmt_en_de_big(args)  # remaining values come from here
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
new file mode 100644
index 0000000000..375a98c2e1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
@@ -0,0 +1,293 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq.utils import new_arange
+
+
+# -------------- Helper Functions --------------------------------------------------- #
+
+
+def load_libnat():  # returns (extension module, True if it is the CUDA build)
+    try:
+        from fairseq import libnat_cuda
+
+        return libnat_cuda, True
+
+    except ImportError as e:
+        print(str(e) + "... fall back to CPU version")  # printed on every call that lands here
+
+        try:
+            from fairseq import libnat
+
+            return libnat, False
+
+        except ImportError as e:
+            import sys
+            # NOTE(review): reached when the CPU `libnat` import fails too; message names libnat_cuda
+            sys.stderr.write(
+                "ERROR: missing libnat_cuda. run `python setup.py build_ext --inplace`\n"
+            )
+            raise e
+
+
+def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx):
+    libnat, use_cuda = load_libnat()  # returns (masked_tgt_masks, masked_tgt_tokens, mask_ins_targets)
+
+    def _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx):
+        in_masks = in_tokens.ne(padding_idx)
+        out_masks = out_tokens.ne(padding_idx)
+        mask_ins_targets, masked_tgt_masks = libnat.generate_insertion_labels(
+            out_tokens.int(),
+            libnat.levenshtein_distance(
+                in_tokens.int(),
+                out_tokens.int(),
+                in_masks.sum(1).int(),
+                out_masks.sum(1).int(),
+            ),
+        )
+        masked_tgt_masks = masked_tgt_masks.bool() & out_masks  # never mark padding
+        mask_ins_targets = mask_ins_targets.type_as(in_tokens)[
+            :, 1 : in_masks.size(1)
+        ].masked_fill_(~in_masks[:, 1:], 0)
+        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)
+        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets
+
+    def _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx):
+        in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1)
+        # work on plain Python lists with the padding entries removed
+        in_tokens_list = [
+            [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist())
+        ]
+        out_tokens_list = [
+            [t for t in s if t != padding_idx]
+            for i, s in enumerate(out_tokens.tolist())
+        ]
+
+        full_labels = libnat.suggested_ed2_path(
+            in_tokens_list, out_tokens_list, padding_idx
+        )
+        mask_inputs = [
+            [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels
+        ]
+
+        # generate labels: each count n contributes one 0 followed by n ones
+        masked_tgt_masks = []
+        for mask_input in mask_inputs:
+            mask_label = []
+            for beam_size in mask_input[1:-1]:  # HACK 1:-1
+                mask_label += [0] + [1 for _ in range(beam_size)]
+            masked_tgt_masks.append(
+                mask_label + [0 for _ in range(out_seq_len - len(mask_label))]
+            )
+        mask_ins_targets = [
+            mask_input[1:-1]
+            + [0 for _ in range(in_seq_len - 1 - len(mask_input[1:-1]))]
+            for mask_input in mask_inputs
+        ]
+
+        # transform to tensor
+        masked_tgt_masks = torch.tensor(
+            masked_tgt_masks, device=out_tokens.device
+        ).bool()
+        mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device)
+        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)
+        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets
+
+    if use_cuda:  # choose the implementation that matches the loaded extension
+        return _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx)
+    return _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx)
+
+
+def _get_del_targets(in_tokens, out_tokens, padding_idx):
+    libnat, use_cuda = load_libnat()  # result: per-token deletion labels for in_tokens
+
+    def _get_del_targets_cuda(in_tokens, out_tokens, padding_idx):
+        in_masks = in_tokens.ne(padding_idx)
+        out_masks = out_tokens.ne(padding_idx)
+
+        word_del_targets = libnat.generate_deletion_labels(
+            in_tokens.int(),
+            libnat.levenshtein_distance(
+                in_tokens.int(),
+                out_tokens.int(),
+                in_masks.sum(1).int(),
+                out_masks.sum(1).int(),
+            ),
+        )
+        word_del_targets = word_del_targets.type_as(in_tokens).masked_fill_(
+            ~in_masks, 0
+        )
+        return word_del_targets
+
+    def _get_del_targets_cpu(in_tokens, out_tokens, padding_idx):
+        out_seq_len = out_tokens.size(1)
+        with torch.cuda.device_of(in_tokens):
+            in_tokens_list = [
+                [t for t in s if t != padding_idx]
+                for i, s in enumerate(in_tokens.tolist())
+            ]
+            out_tokens_list = [
+                [t for t in s if t != padding_idx]
+                for i, s in enumerate(out_tokens.tolist())
+            ]
+
+        full_labels = libnat.suggested_ed2_path(
+            in_tokens_list, out_tokens_list, padding_idx
+        )
+        word_del_targets = [b[-1] for b in full_labels]  # last entry of each path is used
+        word_del_targets = [
+            labels + [0 for _ in range(out_seq_len - len(labels))]
+            for labels in word_del_targets
+        ]
+
+        # transform to tensor
+        word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device)
+        return word_del_targets
+
+    if use_cuda:  # choose the implementation that matches the loaded extension
+        return _get_del_targets_cuda(in_tokens, out_tokens, padding_idx)
+    return _get_del_targets_cpu(in_tokens, out_tokens, padding_idx)
+
+
+def _apply_ins_masks(
+    in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx
+):
+    """Open mask_ins_pred[b, i] unk slots between tokens i and i+1 of every row."""
+    in_masks = in_tokens.ne(padding_idx)
+    in_lengths = in_masks.sum(1)
+
+    # HACK: hacky way to shift all the paddings to eos first (mutates the inputs in place)
+    in_tokens.masked_fill_(~in_masks, eos_idx)
+    mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0)
+
+    out_lengths = in_lengths + mask_ins_pred.sum(1)
+    out_max_len = out_lengths.max()
+    out_masks = new_arange(out_lengths, out_max_len)[None, :] < out_lengths[:, None]
+    # reordering[b, i] is the output column that input token i + 1 is scattered to
+    reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1)
+    out_tokens = (
+        in_tokens.new_zeros(in_tokens.size(0), out_max_len)
+        .fill_(padding_idx)
+        .masked_fill_(out_masks, unk_idx)
+    )
+    out_tokens[:, 0] = in_tokens[:, 0]
+    out_tokens.scatter_(1, reordering, in_tokens[:, 1:])
+
+    out_scores = None
+    if in_scores is not None:
+        in_scores.masked_fill_(~in_masks, 0)  # also in place on the caller's tensor
+        out_scores = in_scores.new_zeros(*out_tokens.size())
+        out_scores[:, 0] = in_scores[:, 0]
+        out_scores.scatter_(1, reordering, in_scores[:, 1:])
+
+    return out_tokens, out_scores
+
+
+def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx):
+    word_ins_masks = in_tokens.eq(unk_idx)  # only unk positions are replaced
+    out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks])
+
+    if in_scores is not None:  # scores are updated at the same positions
+        out_scores = in_scores.masked_scatter(
+            word_ins_masks, word_ins_scores[word_ins_masks]
+        )
+    else:
+        out_scores = None
+
+    return out_tokens, out_scores
+
+
+def _apply_del_words(
+    in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx
+):
+    # apply deletion to a tensor; returns (out_tokens, out_scores, out_attn)
+    in_masks = in_tokens.ne(padding_idx)
+    bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx)
+
+    max_len = in_tokens.size(1)
+    word_del_pred.masked_fill_(~in_masks, 1)  # in place: padding counts as deleted
+    word_del_pred.masked_fill_(bos_eos_masks, 0)  # in place: bos/eos are never deleted
+    # deleted positions get sort key max_len, so the kept positions sort ahead of them
+    reordering = new_arange(in_tokens).masked_fill_(word_del_pred, max_len).sort(1)[1]
+
+    out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering)
+
+    out_scores = None
+    if in_scores is not None:
+        out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering)
+
+    out_attn = None
+    if in_attn is not None:  # same masking and reordering, expanded over the last dim
+        _mask = word_del_pred[:, :, None].expand_as(in_attn)
+        _reordering = reordering[:, :, None].expand_as(in_attn)
+        out_attn = in_attn.masked_fill(_mask, 0.0).gather(1, _reordering)
+
+    return out_tokens, out_scores, out_attn
+
+
+def _skip(x, mask):
+    """
+    Getting sliced (dim=0) tensor by mask. Supporting tensor and list/dict of tensors.
+    """
+    if isinstance(x, int):  # ints are returned unchanged
+        return x
+
+    if x is None:
+        return None
+
+    if isinstance(x, torch.Tensor):
+        if x.size(0) == mask.size(0):
+            return x[mask]
+        elif x.size(1) == mask.size(0):  # otherwise try the mask on dim 1
+            return x[:, mask]
+        # a tensor matching on neither dim falls through to NotImplementedError below
+    if isinstance(x, list):
+        return [_skip(x_i, mask) for x_i in x]
+
+    if isinstance(x, dict):
+        return {k: _skip(v, mask) for k, v in x.items()}
+
+    raise NotImplementedError
+
+
+def _skip_encoder_out(encoder, encoder_out, mask):
+    if not mask.any():  # nothing selected: encoder_out is returned unchanged
+        return encoder_out
+    else:
+        return encoder.reorder_encoder_out(
+            encoder_out, mask.nonzero(as_tuple=False).squeeze()  # indices of True entries
+        )
+
+
+def _fill(x, mask, y, padding_idx):
+    """
+    Filling tensor x with y at masked positions (dim=0).
+    """
+    if x is None:  # nothing to merge into: y is returned as is
+        return y
+    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
+    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))
+    n_selected = mask.sum()
+    assert n_selected == y.size(0)
+    # every row selected: y replaces x entirely
+    if n_selected == x.size(0):
+        return y
+
+    if x.size(1) < y.size(1):  # y is wider: pad x on dim 1 with padding_idx first
+        dims = [x.size(0), y.size(1) - x.size(1)]
+        if x.dim() == 3:
+            dims.append(x.size(2))
+        x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1)
+        x[mask] = y
+    elif x.size(1) > y.size(1):  # y is narrower: reset the rows, then write the prefix
+        x[mask] = padding_idx
+        if x.dim() == 2:
+            x[mask, : y.size(1)] = y
+        else:
+            x[mask, : y.size(1), :] = y
+    else:
+        x[mask] = y
+    return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
new file mode 100644
index 0000000000..d4b3cd931c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import NATransformerModel, base_architecture
+from fairseq.modules import DynamicCRF
+
+
+@register_model("nacrf_transformer")
+class NACRFTransformerModel(NATransformerModel):
+    def __init__(self, args, encoder, decoder):
+        super().__init__(args, encoder, decoder)
+        self.crf_layer = DynamicCRF(  # sized by the number of target-dictionary entries
+            num_embedding=len(self.tgt_dict),
+            low_rank=args.crf_lowrank_approx,
+            beam_size=args.crf_beam_approx,
+        )
+
+    @property
+    def allow_ensemble(self):
+        return False  # always False for this model
+
+    @staticmethod
+    def add_args(parser):
+        NATransformerModel.add_args(parser)  # base NAT options first, then the CRF ones
+        parser.add_argument(
+            "--crf-lowrank-approx",
+            type=int,
+            help="the dimension of low-rank approximation of transition",
+        )
+        parser.add_argument(
+            "--crf-beam-approx",
+            type=int,
+            help="the beam size for apporixmating the normalizing factor",
+        )
+        parser.add_argument(
+            "--word-ins-loss-factor",
+            type=float,
+            help="weights on NAT loss used to co-training with CRF loss.",
+        )
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # encoding (any extra **kwargs are forwarded to the encoder)
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction: unnormalized scores and the target derived from tgt_tokens
+        length_out = self.decoder.forward_length(
+            normalize=False, encoder_out=encoder_out
+        )
+        length_tgt = self.decoder.forward_length_prediction(
+            length_out, encoder_out, tgt_tokens
+        )
+
+        # decoding: unnormalized scores, used as CRF input and as the word_ins "out"
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out,
+        )
+        word_ins_tgt, word_ins_mask = tgt_tokens, tgt_tokens.ne(self.pad)
+
+        # compute the log-likelihood of CRF; negated, divided per row by its non-pad
+        crf_nll = -self.crf_layer(word_ins_out, word_ins_tgt, word_ins_mask)
+        crf_nll = (crf_nll / word_ins_mask.type_as(crf_nll).sum(-1)).mean()  # count, averaged
+
+        return {
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": word_ins_tgt,
+                "mask": word_ins_mask,
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+                "factor": self.args.word_ins_loss_factor,  # from --word-ins-loss-factor
+            },
+            "word_crf": {"loss": crf_nll},  # already a scalar loss, no "out"/"tgt" pair
+            "length": {
+                "out": length_out,
+                "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor,
+            },
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder and get emission scores
+        output_masks = output_tokens.ne(self.pad)  # True at non-pad positions
+        word_ins_out = self.decoder(
+            normalize=False, prev_output_tokens=output_tokens, encoder_out=encoder_out
+        )
+
+        # run viterbi decoding through CRF; only non-pad positions are overwritten, in place
+        _scores, _tokens = self.crf_layer.forward_decoder(word_ins_out, output_masks)
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])
+        if history is not None:
+            history.append(output_tokens.clone())  # snapshot, since tokens are mutated in place
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history,
+        )
+
+
+@register_model_architecture("nacrf_transformer", "nacrf_transformer")
+def nacrf_base_architecture(args):
+    """Set the NACRF defaults that args leaves unset, then apply base_architecture."""
+    for name, value in (("crf_lowrank_approx", 32), ("crf_beam_approx", 64),
+                        ("word_ins_loss_factor", 0.5), ("encoder_normalize_before", True),
+                        ("decoder_normalize_before", True)):
+        setattr(args, name, getattr(args, name, value))
+    base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
new file mode 100644
index 0000000000..0a0221f9c4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
@@ -0,0 +1,254 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq.models.nat import (
+    _apply_del_words,
+    _apply_ins_masks,
+    _apply_ins_words,
+    _fill,
+    _skip,
+    _skip_encoder_out,
+)
+
+
+class _EnsembleModelEncoder(object):
+    """Stand-in "encoder" that reorders one encoder output per ensemble member."""
+
+    def __init__(self, models):
+        self.models = models
+
+    def reorder_encoder_out(self, encoder_outs, new_order):
+        """Apply each model's own encoder reordering to its entry of encoder_outs."""
+        pairs = zip(self.models, encoder_outs)
+        return [m.encoder.reorder_encoder_out(out, new_order) for m, out in pairs]
+
+
+class BasicEnsembleModel(torch.nn.Module):
+    """Base wrapper for an ensemble; special symbols come from the first model."""
+
+    def __init__(self, models):
+        super().__init__()
+        self.models = torch.nn.ModuleList(models)
+        dictionary = self.models[0].decoder.dictionary
+        self.bos, self.eos = dictionary.bos(), dictionary.eos()
+        self.pad, self.unk = dictionary.pad(), dictionary.unk()
+        self.encoder = _EnsembleModelEncoder(self.models)
+
+    def has_encoder(self):
+        return hasattr(self.models[0], "encoder")
+
+    def max_decoder_positions(self):
+        return min(model.max_decoder_positions() for model in self.models)
+
+    @torch.no_grad()
+    def forward_encoder(self, encoder_input):
+        """Return one forward_encoder result per model, or None without an encoder."""
+        if self.has_encoder():
+            return [model.forward_encoder(encoder_input) for model in self.models]
+        return None
+
+    @torch.no_grad()
+    def forward_decoder(self, *inputs):
+        raise NotImplementedError
+
+    def initialize_output_tokens(self, *inputs):
+        raise NotImplementedError
+
+
+class EnsembleLevT(BasicEnsembleModel):
+    """Ensemble of Levenshtein Transformer models; member probabilities are
+    averaged in log space at every refinement sub-step."""
+
+    @torch.no_grad()
+    def forward_decoder(
+        self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs
+    ):
+        """Run one refinement step: deletion, placeholder insertion, word insertion.
+
+        Scores are averaged across the models inside each sub-step, because every
+        sub-step consumes the tokens produced by the previous one. Returns
+        decoder_out with updated tokens, scores and attn, and history=None.
+
+        Args:
+            decoder_out: state providing output_tokens, output_scores and attn.
+            encoder_outs: one encoder output per model, in self.models order.
+            eos_penalty: if > 0, subtracted from class 0 of the placeholder scores.
+            max_ratio: length cap is max(10, src_len * max_ratio); 255 when None.
+        """
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        attn = decoder_out.attn
+
+        bsz = output_tokens.size(0)
+        if max_ratio is None:
+            max_lens = output_tokens.new_full((bsz,), 255)  # one cap per sentence
+        else:
+            if not encoder_outs[0]["encoder_padding_mask"]:
+                # no padding mask: every source spans the whole time axis, which is
+                # dim 0 of the T x B x C encoder output (dim 1 is the batch)
+                enc_feats = encoder_outs[0]["encoder_out"][0]
+                src_lens = enc_feats.new_full((bsz,), enc_feats.size(0))
+            else:
+                src_lens = (~encoder_outs[0]["encoder_padding_mask"][0]).sum(1)
+            max_lens = (src_lens * max_ratio).clamp(min=10).long()
+
+        # delete words
+        # do not delete tokens if it is <s> </s>
+        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
+        if can_del_word.sum() != 0:  # skip when no sentence has a deletable word
+            output_tokens, output_scores, attn = self.forward_word_del(
+                encoder_outs, output_tokens, output_scores, attn, can_del_word
+            )
+
+        # insert placeholders
+        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens
+        if can_ins_mask.sum() != 0:
+            output_tokens, output_scores = self.forward_mask_ins(
+                encoder_outs,
+                output_tokens,
+                output_scores,
+                can_ins_mask,
+                eos_penalty,
+                max_lens,
+            )
+
+        # insert words
+        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0
+        if can_ins_word.sum() != 0:
+            output_tokens, output_scores, attn = self.forward_word_ins(
+                encoder_outs, output_tokens, output_scores, attn, can_ins_word
+            )
+
+        # delete some unnecessary paddings
+        cut_off = output_tokens.ne(self.pad).sum(1).max()
+        output_tokens = output_tokens[:, :cut_off]
+        output_scores = output_scores[:, :cut_off]
+        attn = None if attn is None else attn[:, :cut_off, :]
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=attn,
+            history=None,
+        )
+
+    def forward_word_del(
+        self, encoder_outs, output_tokens, output_scores, attn, can_del_word
+    ):
+        """Apply the ensemble-averaged deletion to the rows flagged by can_del_word."""
+        word_del_score_avg = []
+        word_del_attn_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            word_del_out, word_del_attn = model.decoder.forward_word_del(
+                _skip(output_tokens, can_del_word),
+                _skip_encoder_out(model.encoder, encoder_out, can_del_word),
+            )
+            word_del_score_avg.append(F.log_softmax(word_del_out, 2))
+            word_del_attn_avg.append(word_del_attn)
+        word_del_score_avg = torch.logsumexp(
+            torch.stack(word_del_score_avg, dim=0), dim=0
+        ) - math.log(len(self.models))
+        word_del_pred = word_del_score_avg.max(-1)[1].bool()
+        if word_del_attn_avg[0] is not None:
+            # reduce over the stacked model axis so attn keeps its per-model shape
+            word_del_attn_avg = torch.stack(word_del_attn_avg, dim=0).mean(dim=0)
+        else:
+            word_del_attn_avg = None
+
+        _tokens, _scores, _attn = _apply_del_words(
+            output_tokens[can_del_word],
+            output_scores[can_del_word],
+            word_del_attn_avg,
+            word_del_pred,
+            self.pad,
+            self.bos,
+            self.eos,
+        )
+        output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_del_word, _scores, 0)
+        attn = _fill(attn, can_del_word, _attn, 0.0)
+        return output_tokens, output_scores, attn
+
+    def forward_mask_ins(
+        self,
+        encoder_outs,
+        output_tokens,
+        output_scores,
+        can_ins_mask,
+        eos_penalty,
+        max_lens,
+    ):
+        """Insert placeholders into the rows flagged by can_ins_mask (averaged scores)."""
+        mask_ins_score_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            mask_ins_out, _ = model.decoder.forward_mask_ins(
+                _skip(output_tokens, can_ins_mask),
+                _skip_encoder_out(model.encoder, encoder_out, can_ins_mask),
+            )
+            mask_ins_score = F.log_softmax(mask_ins_out, 2)
+            if eos_penalty > 0.0:
+                mask_ins_score[:, :, 0] -= eos_penalty
+            mask_ins_score_avg.append(mask_ins_score)
+        mask_ins_score_avg = torch.logsumexp(
+            torch.stack(mask_ins_score_avg, dim=0), dim=0
+        ) - math.log(len(self.models))
+        mask_ins_pred = mask_ins_score_avg.max(-1)[1]
+        mask_ins_pred = torch.min(
+            mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred)
+        )
+        _tokens, _scores = _apply_ins_masks(
+            output_tokens[can_ins_mask],
+            output_scores[can_ins_mask],
+            mask_ins_pred,
+            self.pad,
+            self.unk,
+            self.eos,
+        )
+        output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
+        return output_tokens, output_scores
+
+    def forward_word_ins(
+        self, encoder_outs, output_tokens, output_scores, attn, can_ins_word
+    ):
+        """Fill placeholders in the rows flagged by can_ins_word (averaged scores)."""
+        word_ins_score_avg = []
+        word_ins_attn_avg = []
+        for model, encoder_out in zip(self.models, encoder_outs):
+            word_ins_out, word_ins_attn = model.decoder.forward_word_ins(
+                _skip(output_tokens, can_ins_word),
+                _skip_encoder_out(model.encoder, encoder_out, can_ins_word),
+            )
+            word_ins_score_avg.append(F.log_softmax(word_ins_out, 2))
+            word_ins_attn_avg.append(word_ins_attn)
+        word_ins_score_avg = torch.logsumexp(
+            torch.stack(word_ins_score_avg, dim=0), dim=0
+        ) - math.log(len(self.models))
+        if word_ins_attn_avg[0] is not None:
+            word_ins_attn_avg = torch.stack(word_ins_attn_avg, dim=0).mean(dim=0)
+        else:
+            word_ins_attn_avg = None
+        word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1)
+
+        _tokens, _scores = _apply_ins_words(
+            output_tokens[can_ins_word],
+            output_scores[can_ins_word],
+            word_ins_pred,
+            word_ins_score_max,
+            self.unk,
+        )
+
+        output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
+        output_scores = _fill(output_scores, can_ins_word, _scores, 0)
+        # use the ensemble average, not the attention of the last model in the loop
+        attn = _fill(attn, can_ins_word, word_ins_attn_avg, 0.0)
+        return output_tokens, output_scores, attn
+
+    def initialize_output_tokens(self, encoder_outs, src_tokens):
+        # LevT doesn't do length prediction.
+        return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
new file mode 100644
index 0000000000..d114202d25
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
@@ -0,0 +1,456 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.iterative_refinement_generator import DecoderOut
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder
+from fairseq.models.transformer import Embedding
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+def _mean_pooling(enc_feats, src_masks):
+    """Average enc_feats (T x B x C) over time, skipping positions that are True in
+    the padding mask src_masks (B x T); a plain mean over dim 0 when it is None."""
+    if src_masks is None:
+        return enc_feats.mean(0)
+    # T x B weights: 1 for real tokens, 0 for padding
+    keep = (~src_masks).transpose(0, 1).type_as(enc_feats)
+    lengths = keep.sum(0)[None, :, None]
+    # divide first, then mask and sum: a length-normalized masked sum over time
+    weighted = (enc_feats / lengths) * keep[:, :, None]
+    return weighted.sum(0)
+
+
+def _argmax(x, dim):  # indicator (in x's dtype) of the maxima along dim; ties all get 1
+    return x.eq(x.max(dim, keepdim=True)[0]).type_as(x)
+
+
+def _uniform_assignment(src_lens, trg_lens):
+    """Map each target position to a source index by even spacing (batch x max_trg_len)."""
+    # clamp the denominator: a length-1 target would otherwise give x/0 or 0/0 steps
+    steps = (src_lens.float() - 1) / (trg_lens.float() - 1).clamp(min=1)
+    index_t = utils.new_arange(trg_lens, trg_lens.max()).float()  # max_trg_len
+    index_t = steps[:, None] * index_t[None, :]  # batch_size X max_trg_len
+    # inf/nan steps would become invalid gather indices after the cast to long
+    return torch.round(index_t).long().detach()
+
+
+@register_model("nonautoregressive_transformer")
+class NATransformerModel(FairseqNATModel):
+    @property
+    def allow_length_beam(self):
+        return True  # always True for this model
+
+    @staticmethod
+    def add_args(parser):
+        FairseqNATModel.add_args(parser)
+
+        # decoder-input and length-prediction options
+        parser.add_argument(
+            "--src-embedding-copy",
+            action="store_true",
+            help="copy encoder word embeddings as the initial input of the decoder",
+        )
+        parser.add_argument(
+            "--pred-length-offset",
+            action="store_true",
+            help="predicting the length difference between the target and source sentences",
+        )
+        parser.add_argument(
+            "--sg-length-pred",
+            action="store_true",
+            help="stop the gradients back-propagated from the length predictor",
+        )
+        parser.add_argument(
+            "--length-loss-factor",
+            type=float,
+            help="weights on the length prediction loss",
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        decoder = NATransformerDecoder(args, tgt_dict, embed_tokens)
+        if getattr(args, "apply_bert_init", False):  # optional BERT-style re-initialization
+            decoder.apply(init_bert_params)
+        return decoder
+
+    def forward(
+        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
+    ):
+        # encoding (any extra **kwargs are forwarded to the encoder)
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+
+        # length prediction: unnormalized scores and the target derived from tgt_tokens
+        length_out = self.decoder.forward_length(
+            normalize=False, encoder_out=encoder_out
+        )
+        length_tgt = self.decoder.forward_length_prediction(
+            length_out, encoder_out, tgt_tokens
+        )
+
+        # decoding: unnormalized per-position scores
+        word_ins_out = self.decoder(
+            normalize=False,
+            prev_output_tokens=prev_output_tokens,
+            encoder_out=encoder_out,
+        )
+
+        return {
+            "word_ins": {
+                "out": word_ins_out,
+                "tgt": tgt_tokens,
+                "mask": tgt_tokens.ne(self.pad),  # True at non-pad target positions
+                "ls": self.args.label_smoothing,
+                "nll_loss": True,
+            },
+            "length": {
+                "out": length_out,
+                "tgt": length_tgt,
+                "factor": self.decoder.length_loss_factor,
+            },
+        }
+
+    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
+        step = decoder_out.step
+        output_tokens = decoder_out.output_tokens
+        output_scores = decoder_out.output_scores
+        history = decoder_out.history
+
+        # execute the decoder: max over the last dim gives (best score, best token index)
+        output_masks = output_tokens.ne(self.pad)
+        _scores, _tokens = self.decoder(
+            normalize=True,
+            prev_output_tokens=output_tokens,
+            encoder_out=encoder_out,
+            step=step,
+        ).max(-1)
+
+        output_tokens.masked_scatter_(output_masks, _tokens[output_masks])  # in place, non-pad only
+        output_scores.masked_scatter_(output_masks, _scores[output_masks])
+        if history is not None:
+            history.append(output_tokens.clone())  # snapshot, since tokens are mutated in place
+
+        return decoder_out._replace(
+            output_tokens=output_tokens,
+            output_scores=output_scores,
+            attn=None,
+            history=history,
+        )
+
+    def initialize_output_tokens(self, encoder_out, src_tokens):
+        # length prediction (no target tokens are passed, so lengths are predicted)
+        length_tgt = self.decoder.forward_length_prediction(
+            self.decoder.forward_length(normalize=True, encoder_out=encoder_out),
+            encoder_out=encoder_out,
+        )
+
+        max_length = length_tgt.clamp_(min=2).max()  # in-place clamp: room for bos and eos
+        idx_length = utils.new_arange(src_tokens, max_length)
+
+        initial_output_tokens = src_tokens.new_zeros(
+            src_tokens.size(0), max_length
+        ).fill_(self.pad)  # start from an all-pad canvas
+        initial_output_tokens.masked_fill_(
+            idx_length[None, :] < length_tgt[:, None], self.unk
+        )  # unk at every position inside each row's length
+        initial_output_tokens[:, 0] = self.bos
+        initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos)  # eos at index length - 1
+
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(encoder_out["encoder_out"][0])  # zeros in the encoder output's dtype
+
+        return DecoderOut(
+            output_tokens=initial_output_tokens,
+            output_scores=initial_output_scores,
+            attn=None,
+            step=0,
+            max_step=0,
+            history=None,
+        )
+
+    def regenerate_length_beam(self, decoder_out, beam_size):
+        output_tokens = decoder_out.output_tokens
+        length_tgt = output_tokens.ne(self.pad).sum(1)  # current non-pad length per row
+        length_tgt = (  # presumably beam_size candidate lengths around each current length -- confirm
+            length_tgt[:, None]
+            + utils.new_arange(length_tgt, 1, beam_size)
+            - beam_size // 2
+        )
+        length_tgt = length_tgt.view(-1).clamp_(min=2)  # flattened; at least bos and eos
+        max_length = length_tgt.max()
+        idx_length = utils.new_arange(length_tgt, max_length)
+
+        initial_output_tokens = output_tokens.new_zeros(
+            length_tgt.size(0), max_length
+        ).fill_(self.pad)  # start from an all-pad canvas, one row per candidate length
+        initial_output_tokens.masked_fill_(
+            idx_length[None, :] < length_tgt[:, None], self.unk
+        )  # unk at every position inside each row's length
+        initial_output_tokens[:, 0] = self.bos
+        initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos)  # eos at index length - 1
+
+        initial_output_scores = initial_output_tokens.new_zeros(
+            *initial_output_tokens.size()
+        ).type_as(decoder_out.output_scores)
+
+        return decoder_out._replace(
+            output_tokens=initial_output_tokens, output_scores=initial_output_scores
+        )
+
+
+class NATransformerDecoder(FairseqNATDecoder):
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+        self.dictionary = dictionary
+        self.bos = dictionary.bos()
+        self.unk = dictionary.unk()
+        self.eos = dictionary.eos()
+
+        self.encoder_embed_dim = args.encoder_embed_dim
+        self.sg_length_pred = getattr(args, "sg_length_pred", False)
+        self.pred_length_offset = getattr(args, "pred_length_offset", False)
+        self.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+        self.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+        self.embed_length = Embedding(256, self.encoder_embed_dim, None)  # weight is the length classifier
+
+    @ensemble_decoder
+    def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused):
+        features, _ = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            embedding_copy=(step == 0) & self.src_embedding_copy,  # only at step 0
+        )
+        decoder_out = self.output_layer(features)
+        return F.log_softmax(decoder_out, -1) if normalize else decoder_out
+
+    @ensemble_decoder
+    def forward_length(self, normalize, encoder_out):
+        enc_feats = encoder_out["encoder_out"][0]  # T x B x C
+        if len(encoder_out["encoder_padding_mask"]) > 0:
+            src_masks = encoder_out["encoder_padding_mask"][0]  # B x T
+        else:
+            src_masks = None
+        enc_feats = _mean_pooling(enc_feats, src_masks)  # pooled over time
+        if self.sg_length_pred:
+            enc_feats = enc_feats.detach()  # stop gradients from the length loss
+        length_out = F.linear(enc_feats, self.embed_length.weight)  # scores per length class
+        return F.log_softmax(length_out, -1) if normalize else length_out
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        early_exit=None,
+        embedding_copy=False,
+        **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Inputs:
+            prev_output_tokens: Tensor(B, T)
+            encoder_out: a dictionary of hidden states and masks
+            early_exit: if not None, stop before the decoder layer with this index
+            embedding_copy: if True, feed embeddings copied from the source
+
+        Returns:
+            tuple of the `(batch, tgt_len, embed_dim)` features and a dictionary
+            with keys "attn" and "inner_states"; no self-attention mask is used
+        """
+        # embedding
+        if embedding_copy:
+            src_embd = encoder_out["encoder_embedding"][0]
+            if len(encoder_out["encoder_padding_mask"]) > 0:
+                src_mask = encoder_out["encoder_padding_mask"][0]
+            else:
+                src_mask = None
+            src_mask = (  # invert: True now marks real source tokens (all True without a mask)
+                ~src_mask
+                if src_mask is not None
+                else prev_output_tokens.new_ones(*src_embd.size()[:2]).bool()
+            )
+
+            x, decoder_padding_mask = self.forward_embedding(
+                prev_output_tokens,
+                self.forward_copying_source(
+                    src_embd, src_mask, prev_output_tokens.ne(self.padding_idx)
+                ),
+            )
+
+        else:
+
+            x, decoder_padding_mask = self.forward_embedding(prev_output_tokens)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+        inner_states = [x]  # embedding output first, then one entry per executed layer
+
+        # decoder layers
+        for i, layer in enumerate(self.layers):
+
+            # early exit from the decoder.
+            if (early_exit is not None) and (i >= early_exit):
+                break
+
+            x, attn, _ = layer(
+                x,
+                encoder_out["encoder_out"][0]
+                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
+                else None,
+                encoder_out["encoder_padding_mask"][0]
+                if (
+                    encoder_out is not None
+                    and len(encoder_out["encoder_padding_mask"]) > 0
+                )
+                else None,
+                self_attn_mask=None,
+                self_attn_padding_mask=decoder_padding_mask,
+            )
+            inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": attn, "inner_states": inner_states}  # attn comes from the last executed layer
+
+    def forward_embedding(self, prev_output_tokens, states=None):
+        # embed positions
+        positions = (
+            self.embed_positions(prev_output_tokens)
+            if self.embed_positions is not None
+            else None
+        )
+
+        # embed tokens and positions (token embeddings are skipped when states is given)
+        if states is None:
+            x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+            if self.project_in_dim is not None:
+                x = self.project_in_dim(x)
+        else:
+            x = states
+
+        if positions is not None:
+            x += positions  # in place: also modifies `states` when it was passed in
+        x = self.dropout_module(x)
+        decoder_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        return x, decoder_padding_mask
+
+    def forward_copying_source(self, src_embeds, src_masks, tgt_masks):
+        length_sources = src_masks.sum(1)  # number of True entries per row
+        length_targets = tgt_masks.sum(1)
+        mapped_inputs = _uniform_assignment(length_sources, length_targets).masked_fill(
+            ~tgt_masks, 0
+        )  # source index per target position; positions outside tgt_masks read index 0
+        copied_embedding = torch.gather(
+            src_embeds,
+            1,
+            mapped_inputs.unsqueeze(-1).expand(
+                *mapped_inputs.size(), src_embeds.size(-1)
+            ),
+        )
+        return copied_embedding
+
+    def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None):
+        enc_feats = encoder_out["encoder_out"][0]  # T x B x C
+        if len(encoder_out["encoder_padding_mask"]) > 0:
+            src_masks = encoder_out["encoder_padding_mask"][0]  # B x T
+        else:
+            src_masks = None
+        if self.pred_length_offset:  # source lengths are only needed in offset mode
+            if src_masks is None:
+                src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_(
+                    enc_feats.size(0)
+                )
+            else:
+                src_lengs = (~src_masks).transpose(0, 1).type_as(enc_feats).sum(0)
+            src_lengs = src_lengs.long()
+
+        if tgt_tokens is not None:
+            # obtain the length target
+            tgt_lengs = tgt_tokens.ne(self.padding_idx).sum(1).long()
+            if self.pred_length_offset:
+                length_tgt = tgt_lengs - src_lengs + 128  # shift by +128, undone when predicting below
+            else:
+                length_tgt = tgt_lengs
+            length_tgt = length_tgt.clamp(min=0, max=255)
+
+        else:
+            # predict the length target (greedy for now)
+            # TODO: implementing length-beam
+            pred_lengs = length_out.max(-1)[1]
+            if self.pred_length_offset:
+                length_tgt = pred_lengs - 128 + src_lengs
+            else:
+                length_tgt = pred_lengs
+
+        return length_tgt
+
+
+@register_model_architecture(
+    "nonautoregressive_transformer", "nonautoregressive_transformer"
+)
+def base_architecture(args):  # fills every option that args leaves unset, in place
+    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(  # defaults to the encoder value set above
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.apply_bert_init = getattr(args, "apply_bert_init", False)
+    # order matters: the defaults below read decoder_embed_dim, assigned earlier
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # --- special arguments (non-autoregressive length prediction / embedding copy) ---
+    args.sg_length_pred = getattr(args, "sg_length_pred", False)
+    args.pred_length_offset = getattr(args, "pred_length_offset", False)
+    args.length_loss_factor = getattr(args, "length_loss_factor", 0.1)
+    args.src_embedding_copy = getattr(args, "src_embedding_copy", False)
+
+
+@register_model_architecture(
+    "nonautoregressive_transformer", "nonautoregressive_transformer_wmt_en_de"
+)
+def nonautoregressive_transformer_wmt_en_de(args):  # same defaults as base_architecture
+    base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
new file mode 100644
index 0000000000..4cd723ae96
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hub_interface import *  # noqa
+from .model import *  # noqa
+from .enc_dec import *  # noqa
+from .model_camembert import *  # noqa
+from .model_gottbert import *  # noqa
+from .model_xlmr import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
new file mode 100644
index 0000000000..ccc7f74cb9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import Counter
+from typing import List
+
+import torch
+
+
+def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
+    """
+    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
+
+    Args:
+        roberta (RobertaHubInterface): RoBERTa instance
+        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
+        other_tokens (List[str]): other tokens of shape `(T_words)`
+
+    Returns:
+        List[List[int]]: for each of *other_tokens*, its indices in *bpe_tokens*.
+    """
+    assert bpe_tokens.dim() == 1
+    assert bpe_tokens[0] == 0
+
+    def clean(text):
+        return text.strip()
+
+    # remove whitespaces to simplify alignment
+    bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
+    bpe_tokens = [
+        clean(roberta.bpe.decode(x) if x not in {"<s>", ""} else x) for x in bpe_tokens
+    ]
+    other_tokens = [clean(str(o)) for o in other_tokens]
+
+    # strip leading <s>
+    bpe_tokens = bpe_tokens[1:]
+    assert "".join(bpe_tokens) == "".join(other_tokens)
+
+    # create alignment from every word to a list of BPE tokens
+    alignment = []
+    bpe_toks = filter(lambda item: item[1] != "", enumerate(bpe_tokens, start=1))
+    j, bpe_tok = next(bpe_toks)
+    for other_tok in other_tokens:
+        bpe_indices = []
+        while True:
+            if other_tok.startswith(bpe_tok):
+                bpe_indices.append(j)
+                other_tok = other_tok[len(bpe_tok) :]
+                try:
+                    j, bpe_tok = next(bpe_toks)
+                except StopIteration:
+                    j, bpe_tok = None, None
+            elif bpe_tok.startswith(other_tok):
+                # bpe_tok spans multiple other tokens; keep its remainder for the next one
+                bpe_indices.append(j)
+                bpe_tok = bpe_tok[len(other_tok) :]
+                other_tok = ""
+            else:
+                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
+            if other_tok == "":
+                break
+        assert len(bpe_indices) > 0
+        alignment.append(bpe_indices)
+    assert len(alignment) == len(other_tokens)
+
+    return alignment
+
+
+def align_features_to_words(roberta, features, alignment):
+    """
+    Align given features to words.
+
+    Args:
+        roberta (RobertaHubInterface): RoBERTa instance
+        features (torch.Tensor): features to align of shape `(T_bpe x C)`
+        alignment: alignment between BPE tokens and words returned by
+            :func:`align_bpe_to_words`.
+    """
+    assert features.dim() == 2
+    # Divide each BPE row by how many words reference it, so column sums are kept.
+    bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices)
+    assert bpe_counts[0] == 0  # <s> shouldn't be aligned
+    denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))])
+    weighted_features = features / denom.unsqueeze(-1)
+    # Row 0 is kept as-is, then one summed row per word, then any trailing rows.
+    output = [weighted_features[0]]
+    largest_j = -1
+    for bpe_indices in alignment:
+        output.append(weighted_features[bpe_indices].sum(dim=0))
+        largest_j = max(largest_j, *bpe_indices)
+    for j in range(largest_j + 1, len(features)):
+        output.append(weighted_features[j])
+    output = torch.stack(output)
+    assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4)
+    return output
+
+
+def spacy_nlp():
+    """Return a spaCy ``English`` pipeline, created on first use and cached."""
+    if getattr(spacy_nlp, "_nlp", None) is None:
+        try:
+            from spacy.lang.en import English
+            spacy_nlp._nlp = English()
+        except ImportError:
+            raise ImportError("Please install spacy with: pip install spacy")
+    return spacy_nlp._nlp
+
+
+def spacy_tokenizer():
+    """Return the spaCy English tokenizer, created on first use and cached."""
+    if getattr(spacy_tokenizer, "_tokenizer", None) is None:
+        # ``nlp.Defaults.create_tokenizer`` was removed in spaCy v3; the
+        # pipeline's ``tokenizer`` attribute is available in both v2 and v3.
+        # spacy_nlp() already raises the install-hint ImportError if needed.
+        spacy_tokenizer._tokenizer = spacy_nlp().tokenizer
+    return spacy_tokenizer._tokenizer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
new file mode 100644
index 0000000000..e538dee0aa
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
@@ -0,0 +1,192 @@
+import argparse
+import logging
+
+import torch.nn as nn
+import fairseq.checkpoint_utils
+from fairseq.models import (
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import TransformerDecoder
+from fairseq.models.roberta import model as roberta
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("roberta_enc_dec")
+class RobertaEncDecModel(FairseqEncoderDecoderModel):
+    @staticmethod
+    def add_args(parser):
+        parser.add_argument(
+            "--pretrained-mlm-checkpoint",
+            default=None,
+            type=str,
+            metavar="PRETRAINED",
+            help="path to pretrained mlm checkpoint",
+        )
+        parser.add_argument(
+            "--pretrained-decoder", action="store_true", help="reload decoder"
+        )
+        parser.add_argument(
+            "--hack-layernorm-embedding",
+            action="store_true",
+            help="hack to reload old models trained with encoder-normalize-before=False (no equivalent to encoder-normalize-before=False and layernorm_embedding=False",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--share-all-embeddings",
+            action="store_true",
+            help="share encoder, decoder and output embeddings"
+            " (requires shared dictionary and embed dim)",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present
+        base_enc_dec_architecture(args)
+        if args.pretrained_mlm_checkpoint:
+            arg_overrides = None
+            if args.hack_layernorm_embedding:
+                arg_overrides = {"layernorm_embedding": False}
+            loaded = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+                [args.pretrained_mlm_checkpoint], arg_overrides=arg_overrides
+            )
+            ([roberta_enc], _cfg, _task) = loaded
+        else:
+            # Do we need to edit untie_weights here ?
+            share_in_out = (
+                args.share_decoder_input_output_embed or args.share_all_embeddings
+            )
+            args.untie_weights_roberta = not share_in_out
+            if args.hack_layernorm_embedding:
+                args.layernorm_embedding = False
+                args.encoder_normalize_before = False
+            roberta_enc = roberta.RobertaModel.build_model(args, task)
+
+        return cls.from_roberta(roberta_enc, args, task.source_dictionary)
+
+    @staticmethod
+    def from_roberta(roberta_enc: roberta.RobertaModel, args, dictionary):
+        """Build an encoder-decoder model around the encoder of *roberta_enc*."""
+        encoder = roberta_enc.encoder.sentence_encoder
+        vocab_size, embed_dim = encoder.embed_tokens.weight.shape
+        if args.share_all_embeddings:
+            lm_head = roberta_enc.encoder.lm_head
+            assert encoder.embed_tokens.weight is lm_head.weight, (
+                "Can't use --share-all-embeddings with a model "
+                "that was pretrained with --untie-weights-roberta"
+            )
+        else:
+            lm_head = roberta.RobertaLMHead(
+                embed_dim, vocab_size, roberta_enc.args.activation_fn
+            )
+
+        dec_embs = nn.Embedding(vocab_size, embed_dim, dictionary.pad())
+        if args.share_all_embeddings or args.share_decoder_input_output_embed:
+            # Note: I wasn't able to use Embedding _weight parameter to achieve this sharing.
+            dec_embs.weight = lm_head.weight
+
+        decoder = TransformerDecoder(
+            RobertaEncDecModel.read_args_from_roberta(roberta_enc.args),
+            dictionary,
+            dec_embs,
+            no_encoder_attn=False,
+            output_projection=lm_head,
+        )
+        if getattr(args, "pretrained_decoder", False):
+            decoder_dict = encoder.state_dict()
+
+            # TODO: hide setting "encoder_attn" layers behind a flag.
+            for k, w in list(decoder_dict.items()):
+                if ".self_attn" in k:
+                    k_enc_attn = k.replace(".self_attn", ".encoder_attn")
+                    decoder_dict[k_enc_attn] = w.detach().clone()
+
+            for k, w in lm_head.state_dict().items():
+                decoder_dict["output_projection." + k] = w
+
+            missing_keys, unexpected_keys = decoder.load_state_dict(
+                decoder_dict, strict=False
+            )
+            # encoder_attn entries were cloned from self_attn above, so none may be missing
+            assert not missing_keys and not unexpected_keys, (
+                "Failed to load state dict. "
+                f"Missing keys: {missing_keys}. "
+                f"Unexpected keys: {unexpected_keys}."
+            )
+        # Sanity-check that the requested embedding sharing actually holds.
+        if args.share_all_embeddings:
+            assert decoder.output_projection.weight is decoder.embed_tokens.weight
+            assert encoder.embed_tokens.weight is decoder.embed_tokens.weight
+        elif args.share_decoder_input_output_embed:
+            assert decoder.output_projection.weight is decoder.embed_tokens.weight
+            assert encoder.embed_tokens.weight is not decoder.embed_tokens.weight
+        else:
+            assert decoder.output_projection.weight is not decoder.embed_tokens.weight
+            assert encoder.embed_tokens.weight is not decoder.embed_tokens.weight
+
+        return RobertaEncDecModel(encoder, decoder)
+
+    @staticmethod
+    def read_args_from_roberta(roberta_args: argparse.Namespace):
+        # TODO: this would become easier if encoder/decoder where using a similar
+        # TransformerConfig object
+        args = argparse.Namespace(**vars(roberta_args))
+        attr_map = [
+            ("encoder_attention_heads", "decoder_attention_heads"),
+            ("encoder_embed_dim", "decoder_embed_dim"),
+            ("encoder_embed_dim", "decoder_output_dim"),
+            ("encoder_normalize_before", "decoder_normalize_before"),
+            ("encoder_layers_to_keep", "decoder_layers_to_keep"),
+            ("encoder_ffn_embed_dim", "decoder_ffn_embed_dim"),
+            ("encoder_layerdrop", "decoder_layerdrop"),
+            ("encoder_layers", "decoder_layers"),
+            ("encoder_learned_pos", "decoder_learned_pos"),
+            # should this be set from here ?
+            ("max_positions", "max_target_positions"),
+        ]
+        for k1, k2 in attr_map:
+            setattr(args, k2, getattr(roberta_args, k1))
+
+        args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+        args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+        args.share_decoder_input_output_embed = not roberta_args.untie_weights_roberta
+        return args
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rewrite RoBERTa-style checkpoint keys of *state_dict* in place."""
+        prefix = name + "." if name != "" else ""
+        super().upgrade_state_dict_named(state_dict, name)
+        old_keys = list(state_dict.keys())
+        # drop encoder.lm_head.*; strip ".sentence_encoder."; decoder.lm_head -> output_projection
+        for k in old_keys:
+            if k.startswith(prefix + "encoder.lm_head"):
+                state_dict.pop(k)
+                continue
+            new_k = k
+            new_k = new_k.replace(".sentence_encoder.", ".")
+            new_k = new_k.replace("decoder.lm_head.", "decoder.output_projection.")
+            if k == new_k:
+                continue
+            # move the value under its renamed key
+            state_dict[new_k] = state_dict.pop(k)
+
+
+@register_model_architecture("roberta_enc_dec", "roberta_enc_dec")
+def base_enc_dec_architecture(args):
+    """Fill in defaults for the enc-dec options, then the RoBERTa base defaults."""
+    args.hack_layernorm_embedding = getattr(args, "hack_layernorm_embedding", False)
+    args.pretrained_mlm_checkpoint = getattr(args, "pretrained_mlm_checkpoint", None)
+    args.pretrained_decoder = getattr(args, "pretrained_decoder", False)
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    roberta.base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
new file mode 100644
index 0000000000..ba298d63ba
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
@@ -0,0 +1,235 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.data import encoders
+
+
+class RobertaHubInterface(nn.Module):
+    """A simple PyTorch Hub interface to RoBERTa.
+
+    Usage: https://github.com/pytorch/fairseq/tree/main/examples/roberta
+    """
+
+    def __init__(self, cfg, task, model):
+        super().__init__()
+        self.cfg = cfg
+        self.task = task
+        self.model = model
+
+        self.bpe = encoders.build_bpe(cfg.bpe)
+
+        # this is useful for determining the device
+        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))
+
+    @property
+    def device(self):
+        return self._float_tensor.device
+
+    def encode(
+        self, sentence: str, *addl_sentences, no_separator=False
+    ) -> torch.LongTensor:
+        """
+        BPE-encode a sentence (or multiple sentences).
+
+        Every sequence begins with a beginning-of-sentence (`<s>`) symbol.
+        Every sentence ends with an end-of-sentence (`</s>`) and we use an
+        extra end-of-sentence (`</s>`) as a separator.
+
+        Example (single sentence): `<s> a b c </s>`
+        Example (sentence pair): `<s> d e f </s> </s> 1 2 3 </s>`
+
+        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
+        requires leading spaces. For example::
+
+            >>> roberta.encode('Hello world').tolist()
+            [0, 31414, 232, 2]
+            >>> roberta.encode(' world').tolist()
+            [0, 232, 2]
+            >>> roberta.encode('world').tolist()
+            [0, 8331, 2]
+        """
+        bpe_sentence = "<s> " + self.bpe.encode(sentence) + " </s>"
+        for s in addl_sentences:
+            bpe_sentence += " </s>" if not no_separator else ""
+            bpe_sentence += " " + self.bpe.encode(s) + " </s>"
+        tokens = self.task.source_dictionary.encode_line(
+            bpe_sentence, append_eos=False, add_if_not_exist=False
+        )
+        return tokens.long()
+
+    def decode(self, tokens: torch.LongTensor):
+        """Decode 1-D token ids to a string, or a list of strings for several sentences."""
+        assert tokens.dim() == 1
+        tokens = tokens.cpu().numpy()  # numpy() requires a CPU tensor
+        if tokens[0] == self.task.source_dictionary.bos():
+            tokens = tokens[1:]  # remove <s>
+        eos_mask = tokens == self.task.source_dictionary.eos()
+        # two consecutive </s> mark a sentence boundary (see ``encode``)
+        doc_mask = eos_mask[1:] & eos_mask[:-1]
+        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
+        sentences = [
+            self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences
+        ]
+        return sentences[0] if len(sentences) == 1 else sentences
+
+    def extract_features(
+        self, tokens: torch.LongTensor, return_all_hiddens: bool = False
+    ) -> torch.Tensor:
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        if tokens.size(-1) > self.model.max_positions():
+            raise ValueError(
+                "tokens exceeds maximum length: {} > {}".format(
+                    tokens.size(-1), self.model.max_positions()
+                )
+            )
+        features, extra = self.model(
+            tokens.to(device=self.device),
+            features_only=True,
+            return_all_hiddens=return_all_hiddens,
+        )
+        if return_all_hiddens:
+            # convert from T x B x C -> B x T x C
+            inner_states = extra["inner_states"]
+            return [inner_state.transpose(0, 1) for inner_state in inner_states]
+        else:
+            return features  # just the last layer's features
+
+    def register_classification_head(
+        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
+    ):
+        self.model.register_classification_head(
+            name, num_classes=num_classes, embedding_size=embedding_size, **kwargs
+        )
+
+    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
+        features = self.extract_features(tokens.to(device=self.device))
+        logits = self.model.classification_heads[head](features)
+        if return_logits:
+            return logits
+        return F.log_softmax(logits, dim=-1)
+
+    def extract_features_aligned_to_words(
+        self, sentence: str, return_all_hiddens: bool = False
+    ) -> torch.Tensor:
+        """Return a spaCy ``Doc`` whose token vectors are word-aligned RoBERTa features."""
+        from fairseq.models.roberta import alignment_utils
+        from spacy.tokens import Doc
+
+        nlp = alignment_utils.spacy_nlp()
+        tokenizer = alignment_utils.spacy_tokenizer()
+
+        # tokenize both with GPT-2 BPE and spaCy (spaCy runs once; the Doc is reused)
+        bpe_toks = self.encode(sentence)
+        spacy_toks = tokenizer(sentence)
+        spacy_toks_ws = [t.text_with_ws for t in spacy_toks]
+        alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws)
+
+        # extract features and align them
+        features = self.extract_features(
+            bpe_toks, return_all_hiddens=return_all_hiddens
+        )
+        features = features.squeeze(0)
+        aligned_feats = alignment_utils.align_features_to_words(
+            self, features, alignment
+        )
+
+        # wrap in spaCy Doc
+        doc = Doc(
+            nlp.vocab,
+            words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"],
+            spaces=[True]
+            + [x.endswith(" ") for x in spacy_toks_ws[:-1]]
+            + [True, False],
+        )
+        assert len(doc) == aligned_feats.size(0)
+        doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
+        return doc
+
+    def fill_mask(self, masked_input: str, topk: int = 5):
+        masked_token = "<mask>"
+        assert (
+            masked_token in masked_input and masked_input.count(masked_token) == 1
+        ), "Please add one {0} token for the input, eg: 'He is a {0} guy'".format(
+            masked_token
+        )
+
+        text_spans = masked_input.split(masked_token)
+        text_spans_bpe = (
+            (" {0} ".format(masked_token))
+            .join([self.bpe.encode(text_span.rstrip()) for text_span in text_spans])
+            .strip()
+        )
+        tokens = self.task.source_dictionary.encode_line(
+            "<s> " + text_spans_bpe + " </s>",
+            append_eos=False,
+            add_if_not_exist=False,
+        )
+
+        masked_index = (tokens == self.task.mask_idx).nonzero(as_tuple=False)
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+
+        with utils.model_eval(self.model):
+            features, extra = self.model(
+                tokens.long().to(device=self.device),
+                features_only=False,
+                return_all_hiddens=False,
+            )
+        logits = features[0, masked_index, :].squeeze()
+        prob = logits.softmax(dim=0)
+        values, index = prob.topk(k=topk, dim=0)
+        topk_predicted_token_bpe = self.task.source_dictionary.string(index)
+
+        topk_filled_outputs = []
+        for index, predicted_token_bpe in enumerate(
+            topk_predicted_token_bpe.split(" ")
+        ):
+            predicted_token = self.bpe.decode(predicted_token_bpe)
+            # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306
+            if predicted_token_bpe.startswith("\u2581"):
+                predicted_token = " " + predicted_token
+            if " {0}".format(masked_token) in masked_input:
+                topk_filled_outputs.append(
+                    (
+                        masked_input.replace(
+                            " {0}".format(masked_token), predicted_token
+                        ),
+                        values[index].item(),
+                        predicted_token,
+                    )
+                )
+            else:
+                topk_filled_outputs.append(
+                    (
+                        masked_input.replace(masked_token, predicted_token),
+                        values[index].item(),
+                        predicted_token,
+                    )
+                )
+        return topk_filled_outputs
+
+    def disambiguate_pronoun(self, sentence: str) -> bool:
+        """
+        Usage::
+
+            >>> disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
+            True
+
+            >>> disambiguate_pronoun('The trophy would not fit in the brown suitcase because [it] was too big.')
+            'The trophy'
+        """
+        assert hasattr(
+            self.task, "disambiguate_pronoun"
+        ), "roberta.disambiguate_pronoun() requires a model trained with the WSC task."
+        with utils.model_eval(self.model):
+            return self.task.disambiguate_pronoun(
+                self.model, sentence, use_cuda=self.device.type == "cuda"
+            )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
new file mode 100644
index 0000000000..d7ced9190c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
@@ -0,0 +1,700 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+RoBERTa: A Robustly Optimized BERT Pretraining Approach.
+"""
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import DEFAULT_MIN_PARAMS_TO_WRAP, TransformerEncoder
+from fairseq.modules import LayerNorm
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.utils import safe_getattr, safe_hasattr
+
+from .hub_interface import RobertaHubInterface
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("roberta")
+class RobertaModel(FairseqEncoderModel):
+    @classmethod
+    def hub_models(cls):
+        """Map released RoBERTa model names to their download archive URLs."""
+        # Use HTTPS: the archives hold pickled checkpoints, so a download
+        # tampered with over plain HTTP could run arbitrary code when loaded.
+        base_url = "https://dl.fbaipublicfiles.com/fairseq/models/"
+        names = ("roberta.base", "roberta.large", "roberta.large.mnli", "roberta.large.wsc")
+        return {name: base_url + name + ".tar.gz" for name in names}
+
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+        # We follow BERT's random weight initialization
+        self.apply(init_bert_params)
+
+        self.classification_heads = nn.ModuleDict()
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="L", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="H",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="F",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="A",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--pooler-activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use for pooler layer",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--layernorm-embedding",
+            action="store_true",
+            help="add layernorm to embedding",
+        )
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--activation-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN",
+        )
+        parser.add_argument(
+            "--pooler-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability in the masked_lm pooler layers",
+        )
+        parser.add_argument(
+            "--max-positions", type=int, help="number of positional embeddings to learn"
+        )
+        parser.add_argument(
+            "--load-checkpoint-heads",
+            action="store_true",
+            help="(re-)register and load heads when loading checkpoints",
+        )
+        parser.add_argument(
+            "--untie-weights-roberta",
+            action="store_true",
+            help="Untie weights between embeddings and classifiers in RoBERTa",
+        )
+        # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+        parser.add_argument(
+            "--encoder-layerdrop",
+            type=float,
+            metavar="D",
+            default=0,
+            help="LayerDrop probability for encoder",
+        )
+        parser.add_argument(
+            "--encoder-layers-to-keep",
+            default=None,
+            help="which layers to *keep* when pruning as a comma-separated list",
+        )
+        # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+        parser.add_argument(
+            "--quant-noise-pq",
+            type=float,
+            metavar="D",
+            default=0,
+            help="iterative PQ quantization noise at training time",
+        )
+        parser.add_argument(
+            "--quant-noise-pq-block-size",
+            type=int,
+            metavar="D",
+            default=8,
+            help="block size of quantization noise at training time",
+        )
+        parser.add_argument(
+            "--quant-noise-scalar",
+            type=float,
+            metavar="D",
+            default=0,
+            help="scalar quantization noise and scalar quantization at training time",
+        )
+        # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020)
+        parser.add_argument(
+            "--spectral-norm-classification-head",
+            action="store_true",
+            default=False,
+            help="Apply spectral normalization on the classification head",
+        )
+        # args for Fully Sharded Data Parallel (FSDP) training
+        parser.add_argument(
+            "--min-params-to-wrap",
+            type=int,
+            metavar="D",
+            default=DEFAULT_MIN_PARAMS_TO_WRAP,
+            help=(
+                "minimum number of params for a layer to be wrapped with FSDP() when "
+                "training with --ddp-backend=fully_sharded. Smaller values will "
+                "improve memory efficiency, but may make torch.distributed "
+                "communication less efficient due to smaller input sizes. This option "
+                "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
+                "--offload-activations are passed."
+            ),
+        )
+        # args for AdaPruning
+        # In short, it adds regularization for the multihead attention module and feed forward neural nets
+        # For more details, please refer to the paper https://openreview.net/forum?id=_CMSV7FTzGI
+        parser.add_argument(
+            "--mha-reg-scale-factor",
+            type=float,
+            metavar="D",
+            default=0.0,
+            help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
+        )
+        parser.add_argument(
+            "--ffn-reg-scale-factor",
+            type=float,
+            metavar="D",
+            default=0.0,
+            help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375",
+        )
+        parser.add_argument(
+            "--mha-heads-to-keep",
+            type=int,
+            metavar="D",
+            default=-1,
+            help="number of heads to keep in each multi-head attention module, -1 means keeping all heads",
+        )
+        parser.add_argument(
+            "--ffn-blocks-to-remove",
+            type=int,
+            metavar="D",
+            default=-1,
+            help="number of feedforward blocks to remove in each transformer layer, -1 means keeping all ffn blocks",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance.
+
+        Fills in architecture defaults on ``args``, derives ``max_positions``
+        when it is absent, and wraps a new :class:`RobertaEncoder` in ``cls``.
+        """
+        from omegaconf import OmegaConf
+
+        # OmegaConf configs are unlocked while attributes are being added.
+        is_omegaconf = OmegaConf.is_config(args)
+        if is_omegaconf:
+            OmegaConf.set_struct(args, False)
+
+        base_architecture(args)  # make sure all arguments are present
+        if not safe_hasattr(args, "max_positions"):
+            if not safe_hasattr(args, "tokens_per_sample"):
+                args.tokens_per_sample = task.max_positions()
+            args.max_positions = args.tokens_per_sample
+        encoder = RobertaEncoder(args, task.source_dictionary)
+        if is_omegaconf:
+            OmegaConf.set_struct(args, True)
+        return cls(args, encoder)
+
+    def forward(
+        self,
+        src_tokens,
+        features_only=False,
+        return_all_hiddens=False,
+        classification_head_name=None,
+        **kwargs,
+    ):
+        """Encode ``src_tokens``; optionally apply a named classification head."""
+        use_head = classification_head_name is not None
+        # A classification head consumes features, so the LM head is skipped.
+        features_only = True if use_head else features_only
+        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)
+        if use_head:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
+    def _get_adaptive_head_loss(self):
+        """Return the scaled L1 penalty over per-head attention projections.
+
+        For every layer of ``self.encoder.sentence_encoder`` and every
+        attention head ``i``, rows ``[i * head_dim, (i + 1) * head_dim)`` of
+        the ``q_proj``, ``k_proj`` and ``v_proj`` weights and biases are
+        summed in absolute value and multiplied by
+        ``self.args.mha_reg_scale_factor``.
+
+        This is the attention-side regularizer used for adaptive pruning; it
+        is weighted by the ``--mha-reg-scale-factor`` option, which defaults
+        to ``0.0``.
+
+        Note:
+            Every projection is indexed through ``.bias``, so a projection
+            built without a bias (``bias is None``) fails here.
+
+        Returns:
+            The accumulated penalty. It stays the integer ``0`` when there
+            are no layers or no heads to visit; otherwise it is the result
+            of adding the per-projection ``torch.sum`` terms.
+        """
+
+        def l1_norm(tensor):
+            # Sum of the absolute values of all entries.
+            return torch.sum(torch.abs(tensor))
+
+        scale = float(self.args.mha_reg_scale_factor)
+
+        total = 0
+        for layer in self.encoder.sentence_encoder.layers:
+            attn = layer.self_attn
+            head_dim = attn.head_dim
+            layer_total = 0
+            for head in range(attn.num_heads):
+                # Rows of the projection parameters that belong to this head.
+                rows = slice(head * head_dim, (head + 1) * head_dim)
+                # Accumulate in q -> k -> v order, one term per projection.
+                for proj in (attn.q_proj, attn.k_proj, attn.v_proj):
+                    weight_l1 = l1_norm(proj.weight[rows])
+                    bias_l1 = l1_norm(proj.bias[rows])
+                    layer_total += scale * (weight_l1 + bias_l1)
+
+            # Fold this layer's penalty into the running total.
+            total += layer_total
+
+        return total
+
+    def _get_adaptive_ffn_loss(self):
+        """Return the ``ffn_reg_scale_factor``-scaled L1 norm of fc1/fc2 params."""
+        scale = float(self.args.ffn_reg_scale_factor)
+        def scaled_l1(param):
+            return torch.sum(torch.abs(param * scale))
+
+        penalty = 0
+        for layer in self.encoder.sentence_encoder.layers:
+            penalty += scaled_l1(layer.fc1.weight) + scaled_l1(layer.fc2.weight)
+            penalty += scaled_l1(layer.fc1.bias) + scaled_l1(layer.fc2.bias)
+        return penalty
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        # The first element of the net output holds the logits; normalize
+        # them over the last dimension after casting with ``.float()``.
+        logits = net_output[0].float()
+        normalize = F.log_softmax if log_probs else F.softmax
+        return normalize(logits, dim=-1)
+
+    def register_classification_head(
+        self, name, num_classes=None, inner_dim=None, **kwargs
+    ):
+        """Register a classification head, replacing any head of that name."""
+        if name in self.classification_heads:
+            old_head = self.classification_heads[name]
+            prev = (old_head.out_proj.out_features, old_head.dense.out_features)
+            if (num_classes, inner_dim) != prev:
+                logger.warning(
+                    're-registering head "{}" with num_classes {} (prev: {}) '
+                    "and inner_dim {} (prev: {})".format(
+                        name, num_classes, prev[0], inner_dim, prev[1]
+                    )
+                )
+        self.classification_heads[name] = RobertaClassificationHead(
+            input_dim=self.args.encoder_embed_dim,
+            inner_dim=inner_dim or self.args.encoder_embed_dim,
+            num_classes=num_classes,
+            activation_fn=self.args.pooler_activation_fn,
+            pooler_dropout=self.args.pooler_dropout,
+            q_noise=self.args.quant_noise_pq,
+            qn_block_size=self.args.quant_noise_pq_block_size,
+            do_spectral_norm=self.args.spectral_norm_classification_head,
+        )
+
+    @property
+    def supported_targets(self):
+        return {"self"}  # the only target name this model declares
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="gpt2",
+        **kwargs,
+    ):
+        """Load a checkpoint via ``hub_utils`` and wrap it in a hub interface."""
+        from fairseq import hub_utils
+
+        loaded = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        logger.info(loaded["args"])
+        return RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rewrite a checkpoint ``state_dict`` in place to match this model."""
+        prefix = name + "." if name != "" else ""
+        # rename decoder -> encoder before upgrading children modules
+        for k in list(state_dict.keys()):
+            if k.startswith(prefix + "decoder"):
+                new_k = prefix + "encoder" + k[len(prefix + "decoder") :]
+                state_dict[new_k] = state_dict[k]
+                del state_dict[k]
+
+        # rename emb_layer_norm -> layernorm_embedding
+        for k in list(state_dict.keys()):
+            if ".emb_layer_norm." in k:
+                new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.")
+                state_dict[new_k] = state_dict[k]
+                del state_dict[k]
+
+        # upgrade children modules
+        super().upgrade_state_dict_named(state_dict, name)
+
+        # Handle new classification heads present in the state dict.
+        current_head_names = (
+            []
+            if not hasattr(self, "classification_heads")
+            else self.classification_heads.keys()
+        )
+        keys_to_delete = []  # collected during the scan, deleted after it
+        for k in state_dict.keys():
+            if not k.startswith(prefix + "classification_heads."):
+                continue
+
+            head_name = k[len(prefix + "classification_heads.") :].split(".")[0]
+            num_classes = state_dict[
+                prefix + "classification_heads." + head_name + ".out_proj.weight"
+            ].size(0)
+            inner_dim = state_dict[
+                prefix + "classification_heads." + head_name + ".dense.weight"
+            ].size(0)
+
+            if getattr(self.args, "load_checkpoint_heads", False):  # adopt heads
+                if head_name not in current_head_names:
+                    self.register_classification_head(head_name, num_classes, inner_dim)
+            else:  # drop checkpoint heads that are unknown or differently sized
+                if head_name not in current_head_names:
+                    logger.warning(
+                        "deleting classification head ({}) from checkpoint "
+                        "not present in current model: {}".format(head_name, k)
+                    )
+                    keys_to_delete.append(k)
+                elif (
+                    num_classes
+                    != self.classification_heads[head_name].out_proj.out_features
+                    or inner_dim
+                    != self.classification_heads[head_name].dense.out_features
+                ):
+                    logger.warning(
+                        "deleting classification head ({}) from checkpoint "
+                        "with different dimensions than current model: {}".format(
+                            head_name, k
+                        )
+                    )
+                    keys_to_delete.append(k)
+        for k in keys_to_delete:
+            del state_dict[k]
+
+        # Copy any newly-added classification heads into the state dict
+        # with their current weights.
+        if hasattr(self, "classification_heads"):
+            cur_state = self.classification_heads.state_dict()
+            for k, v in cur_state.items():
+                if prefix + "classification_heads." + k not in state_dict:
+                    logger.info("Overwriting " + prefix + "classification_heads." + k)
+                    state_dict[prefix + "classification_heads." + k] = v
+
+            # adapt data2vec models: an EMA checkpoint without LM head weights
+            if (
+                "encoder._ema" in state_dict
+                and "encoder.lm_head.weight" not in state_dict
+            ):
+                lm_state = self.encoder.lm_head.state_dict()
+                for k, v in lm_state.items():
+                    state_dict["encoder.lm_head." + k] = v
+            # NOTE(review): the data2vec keys are matched without ``prefix`` - confirm.
+            for k in list(state_dict.keys()):
+                if k.startswith("encoder.regression_head") or k == "encoder._ema":
+                    del state_dict[k]
+
+
+class RobertaLMHead(nn.Module):
+    """Head for masked language modeling."""
+
+    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
+        """Build the head; ``weight`` is the output projection, made if None."""
+        super().__init__()
+        self.dense = nn.Linear(embed_dim, embed_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.layer_norm = LayerNorm(embed_dim)
+
+        # Without a caller-supplied matrix, allocate a bias-free projection.
+        if weight is None:
+            weight = nn.Linear(embed_dim, output_dim, bias=False).weight
+        self.weight = weight
+        self.bias = nn.Parameter(torch.zeros(output_dim))
+
+    def forward(self, features, masked_tokens=None, **kwargs):
+        """Map ``features`` (optionally only masked rows) to output logits."""
+        # Only project the masked tokens while training,
+        # saves both memory and computation
+        if masked_tokens is not None:
+            features = features[masked_tokens, :]
+
+        hidden = self.layer_norm(self.activation_fn(self.dense(features)))
+        # project back to size of vocabulary with bias
+        return F.linear(hidden, self.weight) + self.bias
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(
+        self,
+        input_dim,
+        inner_dim,
+        num_classes,
+        activation_fn,
+        pooler_dropout,
+        q_noise=0,
+        qn_block_size=8,
+        do_spectral_norm=False,
+    ):
+        """Build the dense, activation, dropout and output projection layers."""
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+
+        projection = nn.Linear(inner_dim, num_classes)
+        self.out_proj = apply_quant_noise_(projection, q_noise, qn_block_size)
+        # Spectral normalization cannot be combined with quantization noise.
+        if do_spectral_norm and q_noise != 0:
+            raise NotImplementedError(
+                "Attempting to use Spectral Normalization with Quant Noise. This is not officially supported"
+            )
+        if do_spectral_norm:
+            self.out_proj = torch.nn.utils.spectral_norm(self.out_proj)
+
+    def forward(self, features, **kwargs):
+        """Classify each sequence from the features at its first position."""
+        first_token = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden = self.dense(self.dropout(first_token))
+        hidden = self.dropout(self.activation_fn(hidden))
+        return self.out_proj(hidden)
+
+
+class RobertaEncoder(FairseqEncoder):
+    """RoBERTa encoder."""
+
+    def __init__(self, args, dictionary):
+        super().__init__(dictionary)
+        # set any missing default values
+        base_architecture(args)
+        self.args = args
+
+        # An explicit keep-list of layers overrides the configured layer count.
+        layers_to_keep = args.encoder_layers_to_keep
+        if layers_to_keep:
+            args.encoder_layers = len(layers_to_keep.split(","))
+
+        embed_tokens = self.build_embedding(
+            len(dictionary), args.encoder_embed_dim, dictionary.pad()
+        )
+        self.sentence_encoder = self.build_encoder(args, dictionary, embed_tokens)
+
+        # The LM head reuses the token-embedding matrix unless weights are untied.
+        tied_weight = None
+        if not args.untie_weights_roberta:
+            tied_weight = self.sentence_encoder.embed_tokens.weight
+        self.lm_head = self.build_lm_head(
+            embed_dim=args.encoder_embed_dim,
+            output_dim=len(dictionary),
+            activation_fn=args.activation_fn,
+            weight=tied_weight,
+        )
+
+    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
+        """Create the token embedding table."""
+        return nn.Embedding(vocab_size, embedding_dim, padding_idx)
+
+    def build_encoder(self, args, dictionary, embed_tokens):
+        """Create the Transformer encoder and apply ``init_bert_params`` to it."""
+        encoder = TransformerEncoder(args, dictionary, embed_tokens)
+        return encoder.apply(init_bert_params)
+
+    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
+        """Create the masked-LM head."""
+        return RobertaLMHead(embed_dim, output_dim, activation_fn, weight)
+
+    def forward(
+        self,
+        src_tokens,
+        features_only=False,
+        return_all_hiddens=False,
+        masked_tokens=None,
+        **unused,
+    ):
+        """Encode tokens and, unless ``features_only``, apply the LM head.
+
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            features_only (bool, optional): skip the LM head and return
+                features of shape `(batch, src_len, embed_dim)`.
+            return_all_hiddens (bool, optional): also return all hidden states.
+            masked_tokens (optional): passed on to the LM head.
+
+        Returns:
+            tuple: (LM output or features, {'inner_states': list or None}).
+        """
+        features, extra = self.extract_features(
+            src_tokens, return_all_hiddens=return_all_hiddens
+        )
+        if features_only:
+            return features, extra
+        return self.output_layer(features, masked_tokens=masked_tokens), extra
+
+    def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs):
+        """Run the sentence encoder; return batch-first features and extras."""
+        out = self.sentence_encoder(
+            src_tokens,
+            return_all_hiddens=return_all_hiddens,
+            token_embeddings=kwargs.get("token_embeddings", None),
+        )
+        # T x B x C -> B x T x C
+        features = out["encoder_out"][0].transpose(0, 1)
+        inner_states = out["encoder_states"] if return_all_hiddens else None
+        return features, {"inner_states": inner_states}
+
+    def output_layer(self, features, masked_tokens=None, **unused):
+        """Apply the LM head to ``features``."""
+        return self.lm_head(features, masked_tokens)
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.args.max_positions
+
+
+@register_model_architecture("roberta", "roberta")
+def base_architecture(args):
+    """Assign each RoBERTa option ``safe_getattr(args, name, default)``.
+
+    ``max_source_positions`` is the one exception: it is read from
+    ``max_positions`` (default 512).
+    """
+    read_from = {"max_source_positions": "max_positions"}
+    defaults = (
+        ("encoder_layers", 12),
+        ("encoder_embed_dim", 768),
+        ("encoder_ffn_embed_dim", 3072),
+        ("encoder_attention_heads", 12),
+        ("dropout", 0.1),
+        ("attention_dropout", 0.1),
+        ("activation_dropout", 0.0),
+        ("pooler_dropout", 0.0),
+        ("max_source_positions", 512),
+        ("no_token_positional_embeddings", False),
+        # BERT has a few structural differences compared to the original Transformer
+        ("encoder_learned_pos", True),
+        ("layernorm_embedding", True),
+        ("no_scale_embedding", True),
+        ("activation_fn", "gelu"),
+        ("encoder_normalize_before", False),
+        ("pooler_activation_fn", "tanh"),
+        ("untie_weights_roberta", False),
+        # Adaptive input config
+        ("adaptive_input", False),
+        # LayerDrop config
+        ("encoder_layerdrop", 0.0),
+        ("encoder_layers_to_keep", None),
+        # Quantization noise config
+        ("quant_noise_pq", 0),
+        ("quant_noise_pq_block_size", 8),
+        ("quant_noise_scalar", 0),
+        # R4F config
+        ("spectral_norm_classification_head", False),
+    )
+
+    # Applied in listing order, one attribute at a time.
+    for name, default in defaults:
+        source = read_from.get(name, name)
+        setattr(args, name, safe_getattr(args, source, default))
+
+
+@register_model_architecture("roberta", "roberta_prenorm")
+def roberta_prenorm_architecture(args):  # pre-norm defaults, set before the base ones
+    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
+    args.encoder_normalize_before = safe_getattr(args, "encoder_normalize_before", True)
+    base_architecture(args)  # every other option gets the base default
+
+
+@register_model_architecture("roberta", "roberta_base")
+def roberta_base_architecture(args):  # registers the base defaults under another name
+    base_architecture(args)
+
+
+@register_model_architecture("roberta", "roberta_large")
+def roberta_large_architecture(args):  # larger size defaults, set before the base ones
+    args.encoder_layers = safe_getattr(args, "encoder_layers", 24)
+    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 4096)
+    args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16)
+    base_architecture(args)  # every other option gets the base default
+
+
+@register_model_architecture("roberta", "xlm")
+def xlm_architecture(args):  # XLM size defaults, set before the base ones
+    args.encoder_layers = safe_getattr(args, "encoder_layers", 16)
+    args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1280)
+    args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 1280 * 4)
+    args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16)
+    base_architecture(args)  # every other option gets the base default
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
new file mode 100644
index 0000000000..46447546fa
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+CamemBERT: a Tasty French Language Model
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model("camembert")
+class CamembertModel(RobertaModel):
+    @classmethod
+    def hub_models(cls):
+        return {  # archive name -> download URL
+            "camembert": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz",
+            "camembert.v0": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz",
+            "camembert-base": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz",
+            "camembert-large": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz",
+            "camembert-base-ccnet": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz",
+            "camembert-base-ccnet-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz",
+            "camembert-base-wikipedia-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz",
+            "camembert-base-oscar-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz",
+        }
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="sentencepiece",
+        **kwargs
+    ):
+        """Load a checkpoint via ``hub_utils`` and wrap it in a hub interface."""
+        from fairseq import hub_utils
+        loaded = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
new file mode 100644
index 0000000000..dc7a019b33
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+GottBERT: a pure German Language Model
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model("gottbert")
+class GottbertModel(RobertaModel):
+    @classmethod
+    def hub_models(cls):
+        return {  # archive name -> download URL
+            "gottbert-base": "https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz",
+        }
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="hf_byte_bpe",
+        bpe_vocab="vocab.json",
+        bpe_merges="merges.txt",
+        bpe_add_prefix_space=False,
+        **kwargs
+    ):
+        """Load a checkpoint via ``hub_utils`` and wrap it in a hub interface."""
+        from fairseq import hub_utils
+        loaded = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            bpe_vocab=bpe_vocab,
+            bpe_merges=bpe_merges,
+            bpe_add_prefix_space=bpe_add_prefix_space,
+            **kwargs,
+        )
+        return RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
new file mode 100644
index 0000000000..cf6e354d53
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Unsupervised Cross-lingual Representation Learning at Scale
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
+@register_model("xlmr")
+class XLMRModel(RobertaModel):
+    @classmethod
+    def hub_models(cls):
+        return {  # archive name -> download URL
+            "xlmr.base": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz",
+            "xlmr.large": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz",
+            "xlmr.xl": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xl.tar.gz",
+            "xlmr.xxl": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xxl.tar.gz",
+        }
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="sentencepiece",
+        **kwargs
+    ):
+        """Load a checkpoint via ``hub_utils`` and wrap it in a hub interface."""
+        from fairseq import hub_utils
+        loaded = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
new file mode 100644
index 0000000000..6ea914d6a5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .speech_dlm import *  # noqa
+from .hub_interface import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
new file mode 100644
index 0000000000..11bc0f50bb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+from typing import Any, Dict, Iterator, List
+
+import torch
+from fairseq import utils
+from omegaconf import open_dict
+from torch import nn
+
+from tqdm import tqdm
+
+from fairseq.hub_utils import GeneratorHubInterface
+
+
+logger = logging.getLogger(__name__)
+
+
+class MultichannelGeneratorHubInterface(GeneratorHubInterface):
+    """Pytorch Hub interface for generating sequences from a pre-trained
+    multichannel language model.
+    """
+
+    def __init__(self, cfg, task, models):
+        super().__init__(cfg, task, models)
+        self.cfg = cfg
+        self.task = task
+        self.models = nn.ModuleList(models)
+        self.src_dicts = task.source_dictionaries
+        self.tgt_dicts = task.target_dictionaries
+        self.channels = task.channels
+
+        # optimize model for generation
+        for model in self.models:
+            model.prepare_for_inference_(cfg)
+
+    def sample(
+        self,
+        sentences: List[Dict[str, str]],
+        beam: int = 1,
+        verbose: bool = False,
+        **kwargs
+    ) -> List[Dict[str, str]]:
+        """Generate from raw per-channel text; a single dict yields a single dict."""
+        if isinstance(sentences, dict):
+            return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
+        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
+        batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
+        return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos]
+
+    def score(self, sentences: List[Dict[str, str]], **kwargs):
+        raise NotImplementedError(
+            "MultichannelGeneratorHubInterface doesn't support score() method"
+        )
+
+    def generate(
+        self,
+        tokenized_sentences: List[Dict[str, torch.LongTensor]],
+        beam: int = 5,
+        verbose: bool = False,
+        skip_invalid_size_inputs=False,
+        inference_step_args=None,
+        **kwargs
+    ) -> List[List[Dict[str, torch.Tensor]]]:
+        """Return, for each tokenized input, its hypotheses in input order."""
+        if isinstance(tokenized_sentences, dict):
+            # Single input: forward every option, not only beam and verbose.
+            return self.generate(
+                [tokenized_sentences], beam=beam, verbose=verbose,
+                skip_invalid_size_inputs=skip_invalid_size_inputs,
+                inference_step_args=inference_step_args, **kwargs
+            )[0]
+
+        # build generator using current args as well as any kwargs
+        gen_args = copy.deepcopy(self.cfg.generation)
+        with open_dict(gen_args):
+            gen_args.beam = beam
+            for k, v in kwargs.items():
+                setattr(gen_args, k, v)
+        generator = self.task.build_generator(self.models, gen_args)
+
+        inference_step_args = inference_step_args or {}
+        results = []
+        batches = self._build_batches(tokenized_sentences, skip_invalid_size_inputs)
+        for batch in tqdm(batches):
+            batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
+            translations = self.task.inference_step(
+                generator, self.models, batch, **inference_step_args
+            )
+            for sample_id, hypos in zip(batch["id"].tolist(), translations):
+                # The output of the generator is supposed to be a tensor of size (bsz x max_len x n_channels)
+                # So we need to convert it to dictionary form
+                for hypo in hypos:
+                    hypo["tokens"] = {
+                        channel: hypo["tokens"][..., j]
+                        for j, channel in enumerate(self.channels)
+                    }
+                results.append((sample_id, hypos))
+
+        # sort output to match input order
+        outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])]
+
+        if verbose:
+            for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs):
+                src_str_with_unk = {
+                    channel: self.string(source_tokens[channel], channel)
+                    for channel in source_tokens
+                }
+                logger.info("S\t{}".format(src_str_with_unk))
+                for hypo in target_hypotheses:
+                    hypo_str = self.decode(hypo["tokens"])
+                    logger.info("H\t{}\t{}".format(hypo["score"], hypo_str))
+                    # hypo["positional_scores"]: T x n_channels; label the channel
+                    # axis by self.channels (as for the tokens), not input-dict order.
+                    positional = hypo["positional_scores"]
+                    pos_scores = {
+                        channel: " ".join(
+                            "{:.4f}".format(x) for x in positional[:, c].tolist()
+                        )
+                        for c, channel in enumerate(self.channels)
+                    }
+                    logger.info("P\t{}".format(pos_scores))
+
+        return outputs
+
+    def encode(self, sentence: Dict[str, str]) -> Dict[str, torch.LongTensor]:
+        assert isinstance(
+            sentence, dict
+        ), "Input sentence is expected to be a dictionary over channels"
+        assert set(sentence.keys()) == set(
+            self.channels
+        ), "Mismatch between input sentence keys and model channels ({} vs {})".format(
+            set(sentence.keys()), set(self.channels)
+        )
+        encoded_sentence = {}
+        for channel in sentence:
+            sentence_channel = sentence[channel]
+            sentence_channel = self.tokenize(sentence_channel)
+            sentence_channel = self.apply_bpe(sentence_channel)
+            sentence_channel = self.binarize(sentence_channel, channel)
+            encoded_sentence[channel] = sentence_channel
+        sentence_size = encoded_sentence[self.channels[0]].size()
+        assert all(
+            encoded_sentence[channel].size() == sentence_size
+            for channel in encoded_sentence
+        ), "Input tensors are expected to have the same size in all channels"
+        return encoded_sentence
+
+    def decode(self, tokens: Dict[str, torch.LongTensor]) -> Dict[str, str]:
+        assert isinstance(
+            tokens, dict
+        ), "Input tokens are expected to be a dictionary over channels"
+        assert set(tokens.keys()) == set(
+            self.channels
+        ), "Mismatch between input tokens keys and model channels ({} vs {})".format(
+            set(tokens.keys()), set(self.channels)
+        )
+        decoded_sentence = {}
+        for channel in tokens:
+            tokens_channel = tokens[channel]
+            sentence_channel = self.string(tokens_channel, channel)
+            sentence_channel = self.remove_bpe(sentence_channel)
+            sentence_channel = self.detokenize(sentence_channel)
+            decoded_sentence[channel] = sentence_channel
+        return decoded_sentence
+
+    def binarize(self, sentence: str, channel: str) -> torch.LongTensor:
+        return (
+            self.src_dicts[channel].encode_line(sentence, add_if_not_exist=False).long()
+        )
+
+    def string(self, tokens: torch.LongTensor, channel: str) -> str:
+        return self.tgt_dicts[channel].string(tokens)
+
+    def _build_batches(
+        self, tokens: List[Dict[str, List[int]]], skip_invalid_size_inputs: bool
+    ) -> Iterator[Dict[str, Any]]:
+        lengths = torch.LongTensor([next(iter(d.values())).numel() for d in tokens])
+        batch_iterator = self.task.get_batch_iterator(
+            dataset=self.task.build_dataset_for_inference(tokens, lengths),
+            max_tokens=self.cfg.dataset.max_tokens,
+            max_sentences=self.cfg.dataset.batch_size,
+            max_positions=self.max_positions,
+            ignore_invalid_inputs=skip_invalid_size_inputs,
+            disable_iterator_cache=True,
+        ).next_epoch_itr(shuffle=False)
+        return batch_iterator
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
new file mode 100644
index 0000000000..a14a1d64a8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
@@ -0,0 +1,572 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.models import FairseqIncrementalDecoder
+from fairseq.modules import (
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    PositionalEmbedding,
+)
+from .speech_dlm_decoder_layer import (
+    CrossChannelTransformerDecoderLayer,
+    StandardTransformerDecoderLayer,
+)
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+from torch import Tensor
+
+
+class CrossChannelTransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Cross-channel Transformer Decoder Block for parallel spoken dialogue units
+    as described in the paper: https://arxiv.org/pdf/2203.16502.pdf;
+    consisting of *args.decoder_layers* layers. Each layer is a
+    :class:`StandardTransformerDecoderLayer` or
+    :class:`CrossChannelTransformerDecoderLayer`.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        channels (list): list of channel names (string)
+        no_encoder_attn (bool, optional): if True, the layers are built without
+            encoder attention (default: False).
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, channels, no_encoder_attn=False):
+        """Build the embeddings, the layer stack and the output projections."""
+        self.args = args
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))
+        self._future_mask = torch.empty(0)
+
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.decoder_layerdrop = args.decoder_layerdrop
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+        self.channels = channels
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        self.embed_dim = embed_dim
+        self.output_embed_dim = args.decoder_output_dim
+
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = args.max_target_positions
+        self.embed_tokens = embed_tokens
+
+        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
+
+        if args.quant_noise_pq > 0:
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(embed_dim, embed_dim, bias=False),
+                args.quant_noise_pq,
+                args.quant_noise_pq_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        self.project_in_dim = (
+            nn.Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+        self.embed_positions = (
+            PositionalEmbedding(
+                self.max_target_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.decoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
+        self.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+        assert 0 <= args.decoder_cross_layers <= args.decoder_layers, (
+            "The number of cross-channel attention decoder layers must be non-negative "
+            f"and not exceed the number of decoder layers (found {args.decoder_cross_layers})"
+        )
+
+        if self.decoder_layerdrop > 0.0:
+            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_decoder_layer(args, no_encoder_attn)
+                if i < args.decoder_layers - args.decoder_cross_layers
+                else self.build_cross_decoder_layer(args, no_encoder_attn)
+                for i in range(args.decoder_layers)
+            ]
+        )
+        self.num_layers = len(self.layers)
+        self.non_cross_layers = args.decoder_layers - args.decoder_cross_layers
+
+        if args.decoder_normalize_before and not getattr(
+            args, "no_decoder_final_norm", False
+        ):
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+        self.project_out_dim = (
+            nn.Linear(embed_dim, self.output_embed_dim, bias=False)
+            if embed_dim != self.output_embed_dim
+            else None
+        )
+
+        # Cross prediction is on iff the second entry of main_and_cross_weights != 0.
+        self.is_cross_prediction = bool(
+            float(args.main_and_cross_weights.split(",")[1]) != 0
+        )
+        self.n_output_projections = (
+            1 if not self.is_cross_prediction else len(self.channels)
+        )
+
+        if self.share_input_output_embed:
+            # Output projection is a list of projections
+            # where the first proj is for the main-channel,
+            # then roll in a circular way.
+            # For example: if the main channel has index i
+            # the second proj is for channel i+1 (mod N_channels), etc.
+            self.output_projection = nn.ModuleList(
+                [
+                    nn.Linear(
+                        embed_tokens.weight.shape[1],  # embed_dim
+                        embed_tokens.weight.shape[0],  # vocabulary size (num_embeddings)
+                        bias=False,
+                    )
+                    for _ in range(self.n_output_projections)
+                ]
+            )
+            # Only share the main-channel projection
+            self.output_projection[0].weight = embed_tokens.weight
+            for i in range(1, self.n_output_projections):
+                nn.init.normal_(
+                    self.output_projection[i].weight,
+                    mean=0,
+                    std=embed_tokens.weight.shape[1] ** -0.5,
+                )
+        else:
+            self.output_projection = nn.ModuleList(
+                [
+                    nn.Linear(self.output_embed_dim, len(dictionary), bias=False)
+                    for _ in range(self.n_output_projections)
+                ]
+            )
+            for i in range(self.n_output_projections):
+                nn.init.normal_(
+                    self.output_projection[i].weight,
+                    mean=0,
+                    std=self.output_embed_dim**-0.5,
+                )
+        self.output_duration_prediction = (
+            None
+            if str(args.duration_prediction).lower() == "false"
+            else nn.ModuleList(
+                [
+                    nn.Linear(self.output_embed_dim, 1)
+                    for _ in range(self.n_output_projections)
+                ]
+            )
+        )
+
+    def build_decoder_layer(self, args, no_encoder_attn=False):
+        """Create a standard decoder layer, checkpoint-wrapped if args ask for it."""
+        wrap = getattr(args, "checkpoint_activations", False)
+        offload = getattr(args, "offload_activations", False)
+        layer = StandardTransformerDecoderLayer(args, no_encoder_attn)
+        return checkpoint_wrapper(layer, offload_to_cpu=offload) if wrap else layer
+
+    def build_cross_decoder_layer(self, args, no_encoder_attn=False):
+        """Create a cross-channel layer, checkpoint-wrapped if args ask for it."""
+        wrap = getattr(args, "checkpoint_activations", False)
+        offload = getattr(args, "offload_activations", False)
+        layer = CrossChannelTransformerDecoderLayer(args, no_encoder_attn)
+        return checkpoint_wrapper(layer, offload_to_cpu=offload) if wrap else layer
+
+    def forward(
+        self,
+        prev_output_tokens: Dict[str, Tensor],
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[
+            List[Dict[str, Dict[str, Optional[Tensor]]]]
+        ] = None,
+        features_only: bool = False,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+        src_lengths: Optional[Any] = None,
+        # NOTE: src_lengths is accepted but not used by this method.
+    ):
+        """Decode all channels, optionally projecting features to the vocabulary.
+
+        Args:
+            prev_output_tokens (dict[str, LongTensor]): per-channel previous
+                decoder outputs of shape `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): encoder output for encoder-side attention
+            incremental_state (list, optional): per-channel dictionaries used
+                for storing state during :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            alignment_layer, alignment_heads: passed on to extract_features
+
+        Returns:
+            tuple:
+                - :meth:`output_layer` result, or the per-channel feature
+                    dict when *features_only* is set
+                - a dictionary with any model-specific outputs
+        """
+        features, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            full_context_alignment=full_context_alignment,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+        if features_only:
+            return features, extra
+        return self.output_layer(features), extra
+
+    def extract_features(
+        self,
+        prev_output_tokens: Dict[str, Tensor],
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[
+            List[Dict[str, Dict[str, Optional[Tensor]]]]
+        ] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Return the decoder features; the actual work is done by
+        :meth:`extract_features_scriptable`, called with the same arguments.
+        """
+        return self.extract_features_scriptable(
+            prev_output_tokens, encoder_out, incremental_state,
+            full_context_alignment, alignment_layer, alignment_heads,
+        )
+
+    """
+    A scriptable subclass of this class has an extract_features method and calls
+    super().extract_features, but super() is not supported in torchscript. A copy of
+    this function is made to be used in the subclass instead.
+    """
+
+    def extract_features_scriptable(
+        self,
+        prev_output_tokens: Dict[str, Tensor],
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[
+            List[Dict[str, Dict[str, Optional[Tensor]]]]
+        ] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        The core function of *forward*, returning features only.
+
+        Args:
+            prev_output_tokens (dict): channel name -> tensor `(batch x tgt_len)`;
+                the caller's dictionary is not modified. Padding is detected
+                on the first channel only.
+            encoder_out (optional): its first "encoder_out" and
+                "encoder_padding_mask" entries are passed to every layer.
+            incremental_state (list, optional): one state dict per channel; when
+                given, only the last time step of each channel is embedded.
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            alignment_layer (int, optional): return mean alignment over
+                heads at this layer (default: last layer).
+            alignment_heads (int, optional): only average alignment over
+                this many heads (default: all heads).
+
+        Returns:
+            tuple: (dict of per-channel features `(batch, tgt_len, embed_dim)`,
+                {"attn": [attn], "inner_states": inner_states})
+        """
+        if alignment_layer is None:
+            alignment_layer = self.num_layers - 1
+        # Work on a shallow copy so that the caller's dictionary is left untouched.
+        prev_output_tokens = {c: t for c, t in prev_output_tokens.items()}
+
+        x_list = []
+        for i, channel in enumerate(self.channels):
+            # embed positions from the full prefix; sliced to the last step below
+            positions = None
+            if self.embed_positions is not None:
+                positions = self.embed_positions(
+                    prev_output_tokens[channel],
+                    incremental_state=incremental_state[i]
+                    if incremental_state is not None
+                    else None,
+                )
+
+            if incremental_state is not None:
+                prev_output_tokens[channel] = prev_output_tokens[channel][:, -1:]
+                if positions is not None:
+                    positions = positions[:, -1:]
+
+            # embed tokens and positions
+            x = self.embed_tokens(prev_output_tokens[channel])
+
+            if self.project_in_dim is not None:
+                x = self.project_in_dim(x)
+
+            x = self.embed_scale * x
+
+            if self.quant_noise is not None:
+                x = self.quant_noise(x)
+
+            if positions is not None:
+                x += positions
+
+            if self.layernorm_embedding is not None:
+                x = self.layernorm_embedding(x)
+
+            x = self.dropout_module(x)
+
+            # B x T x C -> T x B x C
+            x = x.transpose(0, 1)
+
+            x_list.append(x)
+
+        self_attn_padding_mask: Optional[Tensor] = None
+        if (
+            self.cross_self_attention
+            or prev_output_tokens[self.channels[0]].eq(self.padding_idx).any()
+        ):
+            self_attn_padding_mask = prev_output_tokens[self.channels[0]].eq(
+                self.padding_idx
+            )
+
+        # decoder layers (all channels go through each layer together)
+        attn: Optional[Dict[str, Tensor]] = None
+        inner_states: List[Optional[Dict[str, Tensor]]] = [
+            {channel: x_list[i] for i, channel in enumerate(self.channels)}
+        ]
+        for idx, layer in enumerate(self.layers):
+            if incremental_state is None and not full_context_alignment:
+                self_attn_mask = self.buffered_future_mask(x_list[0])
+            else:
+                self_attn_mask = None
+
+            # need to change to tensor for the checkpoint activation to work
+            if isinstance(x_list, list):
+                x_list = torch.stack(x_list)
+            x_list, layer_attn_list, _ = layer(
+                x_list,
+                encoder_out["encoder_out"][0]
+                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
+                else None,
+                encoder_out["encoder_padding_mask"][0]
+                if (
+                    encoder_out is not None
+                    and len(encoder_out["encoder_padding_mask"]) > 0
+                )
+                else None,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+
+            inner_states.append(
+                {channel: x_list[i] for i, channel in enumerate(self.channels)}
+            )
+            if idx == alignment_layer and all(
+                layer_attn is not None for layer_attn in layer_attn_list
+            ):
+                attn = {
+                    channel: layer_attn_list[i].float().to(x_list[0])
+                    for i, channel in enumerate(self.channels)
+                }
+        # change back from tensor to list
+        if not isinstance(x_list, list):
+            x_list = list(torch.unbind(x_list))
+
+        if attn is not None:
+            for channel in attn:
+                if alignment_heads is not None:
+                    attn[channel] = attn[channel][:alignment_heads]
+
+                # average probabilities over heads
+                attn[channel] = attn[channel].mean(dim=0)
+
+        for i, x in enumerate(x_list):
+            if self.layer_norm is not None:
+                x = self.layer_norm(x)
+
+            # T x B x C -> B x T x C
+            x = x.transpose(0, 1)
+
+            if self.project_out_dim is not None:
+                x = self.project_out_dim(x)
+
+            x_list[i] = x
+
+        x = {channel: x_list[i] for i, channel in enumerate(self.channels)}
+
+        return x, {"attn": [attn], "inner_states": inner_states}
+
+    def output_layer(self, features):
+        """Project features to the vocabulary size.
+
+        Args:
+            features (dict): decoder features keyed by channel name, as
+                returned by :meth:`extract_features`.
+
+        Return a dictionary of the form:
+            {
+                'input-channel': {
+                    'predicted-channel': token prediction tensor of shape `(batch, tgt_len, vocab)`,
+                }
+            }
+
+        if duration_prediction is enabled
+            {
+                'input-channel': {
+                    'predicted-channel': {
+                        'pred_token': token prediction tensor of shape `(batch, tgt_len, vocab)`,
+                        'pred_duration': duration prediction tensor
+                    }
+                }
+            }
+
+        Without cross prediction each input channel only predicts itself,
+        using the first head.
+
+        With cross prediction, input channel i predicts channel j with head
+        ``j - i``; negative indices wrap around. E.g. for channels [A, B]:
+            A -> A and B -> B use head 0,
+            A -> B uses head 1,
+            B -> A uses head -1 (the last head).
+        """
+        # project back to size of vocabulary
+        predict_duration = self.output_duration_prediction is not None
+
+        def apply_head(head_idx, channel_features):
+            """Apply token (and, if enabled, duration) head number *head_idx*."""
+            pred_token = self.output_projection[head_idx](channel_features)
+            if not predict_duration:
+                return pred_token
+            pred_duration = self.output_duration_prediction[head_idx](channel_features)
+            return {
+                "pred_token": pred_token,
+                "pred_duration": pred_duration,
+            }
+
+        outputs = {}
+        for i, channel in enumerate(self.channels):
+            channel_features = features[channel]
+            if self.is_cross_prediction:
+                # Head j - i serves the pair (input i, predicted j); a negative
+                # index counts from the end of the list of heads.
+                outputs[channel] = {
+                    pred_channel: apply_head(j - i, channel_features)
+                    for j, pred_channel in enumerate(self.channels)
+                }
+            else:
+                # Main-channel prediction only: always the first head.
+                outputs[channel] = {channel: apply_head(0, channel_features)}
+
+        return outputs
+
+    def max_positions(self):
+        """Return the maximum output length supported by the decoder."""
+        positional = self.embed_positions
+        limit = self.max_target_positions
+        return limit if positional is None else min(limit, positional.max_positions)
+
+    def buffered_future_mask(self, tensor):
+        """Return the cached future mask, cropped to `tensor.size(0)` rows and columns.
+
+        It is rebuilt when empty, too small or on a different device than *tensor*.
+        """
+        dim = tensor.size(0)
+        mask = self._future_mask
+        # `not a == b` is used because `!=` on devices is not working in TorchScript.
+        needs_rebuild = mask.size(0) == 0 or mask.size(0) < dim
+        if needs_rebuild or not mask.device == tensor.device:
+            mask = torch.triu(utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1)
+        self._future_mask = mask.to(tensor)
+        return self._future_mask[:dim, :dim]
+
+    def get_normalized_probs_scriptable(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Get normalized probabilities (or log probs) from a net's output.
+
+        Args:
+            net_output: decoder output; only its first element (the nested
+                ``{channel: {pred_channel: ...}}`` logits dict) is used.
+            log_probs: apply log-softmax if true, softmax otherwise.
+
+        Returns:
+            dict with the same nesting; token logits are normalized over the last
+            dimension, duration predictions (if any) are only cast to float32.
+        """
+        normalize = utils.log_softmax if log_probs else utils.softmax
+        out_dict = {}
+        for channel, channel_logits in net_output[0].items():
+            out_dict[channel] = {}
+            for pred_channel, prediction in channel_logits.items():
+                if isinstance(prediction, dict):
+                    out_dict[channel][pred_channel] = {
+                        "pred_token": normalize(
+                            prediction["pred_token"], dim=-1, onnx_trace=self.onnx_trace
+                        ),
+                        # move to float32 to avoid inf loss
+                        "pred_duration": prediction["pred_duration"].float(),
+                    }
+                else:
+                    out_dict[channel][pred_channel] = normalize(
+                        prediction, dim=-1, onnx_trace=self.onnx_trace
+                    )
+        return out_dict
+
+    def reorder_incremental_state_scripting(
+        self,
+        incremental_state: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order: Tensor,
+    ):
+        """Main entry point for reordering the incremental state.
+
+        Every sub-module exposing ``reorder_incremental_state`` is applied to
+        each channel's state; a non-None result replaces that list entry.
+        Due to limitations in TorchScript, the sequence generator calls this
+        function instead of calling :func:`reorder_incremental_state` directly.
+        """
+        for module in self.modules():
+            if not hasattr(module, "reorder_incremental_state"):
+                continue
+            for idx, channel_state in enumerate(incremental_state):
+                reordered = module.reorder_incremental_state(channel_state, new_order)
+                if reordered is not None:
+                    incremental_state[idx] = reordered
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
new file mode 100644
index 0000000000..fb65fdf810
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
@@ -0,0 +1,717 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Tuple, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+from torch import Tensor
+
+
+class CrossChannelTransformerDecoderLayer(nn.Module):
+    """Cross-Attention Transformer Decoder Layer block as described
+    in the paper: https://arxiv.org/pdf/2203.16502.pdf
+
+    Composed of a Multi-head Self Attention block followed by a
+    Multi-head Cross-Attention block which attends to the self-attention
+    outputs of the other channels. The weights of the attention blocks
+    in all channels are shared.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): if True, the layer is built without
+            encoder attention (default: False).
+    """
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):  # add_bias_kv / add_zero_attn are passed to self- and cross-channel attention
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.quant_noise = getattr(args, "quant_noise_pq", 0)
+        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+
+        # This cross_self_attention flag is used for encoder-decoder systems;
+        # it is not the cross-channel attention (defined below as cross_channel_attn)
+        self.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+        self.self_attn = self.build_self_attention(
+            self.embed_dim,
+            args,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+        self.cross_channel_attn = self.build_cross_channel_attention(
+            self.embed_dim,
+            args,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+
+        self.activation_fn = utils.get_activation_fn(
+            activation=str(args.activation_fn)
+            if getattr(args, "activation_fn", None) is not None
+            else "relu"
+        )
+        activation_dropout_p = getattr(args, "activation_dropout", 0) or 0  # None -> 0
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+        self.normalize_before = args.decoder_normalize_before
+
+        # use layerNorm rather than FusedLayerNorm for exporting.
+        # char_inputs can be used to determine this.
+        # TODO  remove this once we update apex with the fix
+        export = getattr(args, "char_inputs", False)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+        self.cross_channel_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        if no_encoder_attn:
+            self.encoder_attn = None  # layer built without encoder attention
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        self.fc1 = self.build_fc1(
+            self.embed_dim,
+            args.decoder_ffn_embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+        self.fc2 = self.build_fc2(
+            args.decoder_ffn_embed_dim,
+            self.embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
+        self.need_attn = True
+
+        self.onnx_trace = False  # set to True by prepare_for_onnx_export_()
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):  # Linear + quant_noise
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):  # Linear + quant_noise
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_self_attention(
+        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
+    ):
+        """Build the multi-head self-attention module of this layer."""
+        attn_kwargs = dict(
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=not getattr(args, "cross_self_attention", False),
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+        return MultiheadAttention(embed_dim, args.decoder_attention_heads, **attn_kwargs)
+
+    def build_cross_channel_attention(
+        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
+    ):
+        """Build the multi-head attention module used across channels."""
+        attn_kwargs = dict(
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=False,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+        return MultiheadAttention(embed_dim, args.decoder_attention_heads, **attn_kwargs)
+
+    def build_encoder_attention(self, embed_dim, args):
+        """Build the encoder-decoder attention; keys/values use encoder_embed_dim."""
+        attn_kwargs = dict(
+            kdim=getattr(args, "encoder_embed_dim", None),
+            vdim=getattr(args, "encoder_embed_dim", None),
+            dropout=args.attention_dropout,
+            encoder_decoder_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+        return MultiheadAttention(embed_dim, args.decoder_attention_heads, **attn_kwargs)
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # the flag is initialised to False in __init__
+
+    def residual_connection(self, x, residual):
+        return residual + x  # plain additive skip connection
+
+    def forward(
+        self,
+        x_list_tensor: List[torch.Tensor],
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        incremental_state: Optional[
+            List[Dict[str, Dict[str, Optional[Tensor]]]]
+        ] = None,
+        prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """
+        Args:
+            x_list_tensor (List[Tensor]): list of input tensors in different channels,
+                each tensor is of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor, optional): binary
+                ByteTensor of shape `(batch, src_len)` where padding
+                elements are indicated by ``1``.
+            incremental_state (optional): list of incremental_state dictionaries over
+                different channels (sequence generation mode)
+            prev_self_attn_state (List[Tuple[Tensor, Tensor]], optional): list of tuples
+                (self_attn_state, cross_channel_attn_state) over different channels
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return attention weights
+                for each head (default: return average over heads).
+
+        Returns:
+            list of encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        n_channels = len(x_list_tensor)
+        if need_head_weights:
+            need_attn = True
+
+        # incremental_state is a list of dictionaries over different channels
+        if incremental_state is not None:
+            assert isinstance(incremental_state, list)
+            assert len(incremental_state) == n_channels
+
+        # prev_self_attn_state is a list of tuples (self_attn_state, cross_channel_attn_state) over different channels
+        if prev_self_attn_state is not None:
+            assert isinstance(prev_self_attn_state, list)
+            assert len(prev_self_attn_state) == n_channels
+            for prev_self_attn_state_channel in prev_self_attn_state:
+                assert isinstance(prev_self_attn_state_channel, tuple)
+                assert len(prev_self_attn_state_channel) == 2
+
+        # Backup for other channels & cross channel attention
+        self_attn_mask_orin = self_attn_mask
+        self_attn_padding_mask_orin = self_attn_padding_mask
+
+        x_list = []
+        attn_list = []
+        for i, x in enumerate(x_list_tensor):
+            residual = x
+
+            if self.normalize_before:
+                x = self.self_attn_layer_norm(x)
+
+            if prev_self_attn_state is not None:
+                prev_key, prev_value = prev_self_attn_state[i][0][:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_self_attn_state[i][0]) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][0][2]
+                assert incremental_state is not None
+                self.self_attn._set_input_buffer(incremental_state[i], saved_state)
+            _self_attn_input_buffer = self.self_attn._get_input_buffer(
+                incremental_state[i] if incremental_state is not None else None
+            )
+            if self.cross_self_attention and not (
+                incremental_state is not None
+                and _self_attn_input_buffer is not None
+                and "prev_key" in _self_attn_input_buffer
+            ):
+                if self_attn_mask_orin is not None:
+                    assert encoder_out is not None
+                    self_attn_mask = torch.cat(
+                        (
+                            x.new_zeros(x.size(0), encoder_out.size(0)),
+                            self_attn_mask_orin,
+                        ),
+                        dim=1,
+                    )
+                if self_attn_padding_mask_orin is not None:
+                    if encoder_padding_mask is None:
+                        assert encoder_out is not None
+                        encoder_padding_mask = self_attn_padding_mask_orin.new_zeros(
+                            encoder_out.size(1), encoder_out.size(0)
+                        )
+                    self_attn_padding_mask = torch.cat(
+                        (encoder_padding_mask, self_attn_padding_mask_orin), dim=1
+                    )
+                assert encoder_out is not None
+                y = torch.cat((encoder_out, x), dim=0)
+            else:
+                y = x
+
+            x, attn = self.self_attn(
+                query=x,
+                key=y,
+                value=y,
+                key_padding_mask=self_attn_padding_mask,
+                incremental_state=incremental_state[i]
+                if incremental_state is not None
+                else None,
+                need_weights=False,
+                attn_mask=self_attn_mask,
+            )
+
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.self_attn_layer_norm(x)
+
+            if self.encoder_attn is not None and encoder_out is not None:
+                residual = x
+                if self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+                if prev_attn_state is not None:
+                    prev_key, prev_value = prev_attn_state[:2]
+                    saved_state: Dict[str, Optional[Tensor]] = {
+                        "prev_key": prev_key,
+                        "prev_value": prev_value,
+                    }
+                    if len(prev_attn_state) >= 3:
+                        saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                    assert incremental_state is not None
+                    self.encoder_attn._set_input_buffer(
+                        incremental_state[i], saved_state
+                    )
+
+                x, attn = self.encoder_attn(
+                    query=x,
+                    key=encoder_out,
+                    value=encoder_out,
+                    key_padding_mask=encoder_padding_mask,
+                    incremental_state=incremental_state[i]
+                    if incremental_state is not None
+                    else None,
+                    static_kv=True,
+                    need_weights=need_attn or (not self.training and self.need_attn),
+                    need_head_weights=need_head_weights,
+                )
+                x = self.dropout_module(x)
+                x = self.residual_connection(x, residual)
+                if not self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+
+            x_list.append(x)
+            attn_list.append(attn)
+
+        # Store attentions & new x(s) (bc the old x(s) are used in other channels)
+        x_list_new = []
+        # Here comes the cross channel attention
+        for i, x in enumerate(x_list):
+            residual = x
+            if self.normalize_before:
+                x = self.cross_channel_attn_layer_norm(x)
+
+            if prev_self_attn_state is not None:
+                prev_key, prev_value = prev_self_attn_state[i][1][:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_self_attn_state[i][1]) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][1][2]
+                assert incremental_state is not None
+                self.cross_channel_attn._set_input_buffer(
+                    incremental_state[i], saved_state
+                )
+
+            # The cross attention is computed with the concatenation of attentions from other channels
+            if len(x_list) > 1:
+                x_other = torch.cat(
+                    [x_list[(i + j) % len(x_list)] for j in range(1, len(x_list))],
+                    dim=0,
+                )
+            else:
+                # Self-attention when having only one channel
+                x_other = x_list[i]
+
+            x, attn = self.cross_channel_attn(
+                query=x,
+                key=x_other,
+                value=x_other,
+                key_padding_mask=self_attn_padding_mask_orin,
+                incremental_state=incremental_state[i]
+                if incremental_state is not None
+                else None,
+                need_weights=False,
+                attn_mask=self_attn_mask_orin,
+            )
+
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.cross_channel_attn_layer_norm(x)
+
+            x_list_new.append(x)
+        x_list = x_list_new
+
+        for i, x in enumerate(x_list):
+            residual = x
+            if self.normalize_before:
+                x = self.final_layer_norm(x)
+
+            x = self.activation_fn(self.fc1(x))
+            x = self.activation_dropout_module(x)
+            x = self.fc2(x)
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.final_layer_norm(x)
+
+            x_list[i] = x
+        # Trick for the checkpoint activation
+        x_list_tensor = torch.stack(x_list)
+        if self.onnx_trace and incremental_state is not None:
+            self_and_cross_attn_state_list = []
+            for i in range(n_channels):
+                self_and_cross_attn_state = []
+                for self_attn_module in [self.self_attn, self.cross_channel_attn]:
+                    saved_state = self_attn_module._get_input_buffer(
+                        incremental_state[i]
+                    )
+                    assert saved_state is not None
+                    if self_attn_padding_mask is not None:
+                        self_attn_module_state = [
+                            saved_state["prev_key"],
+                            saved_state["prev_value"],
+                            saved_state["prev_key_padding_mask"],
+                        ]
+                    else:
+                        self_attn_module_state = [
+                            saved_state["prev_key"],
+                            saved_state["prev_value"],
+                        ]
+                    self_and_cross_attn_state.append(self_attn_module_state)
+                self_and_cross_attn_state_list.append(tuple(self_and_cross_attn_state))
+            return x_list_tensor, attn_list, self_and_cross_attn_state_list
+        return x_list_tensor, attn_list, None
+
+    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
+        # Record on the layer whether attention weights are wanted; the value
+        # is stored as ``self.need_attn``. Any extra keyword arguments are
+        # accepted and ignored.
+        self.need_attn = need_attn
+
+
+# Rewrite fairseq.modules.TransformerDecoderLayer
+# to be compatible with checkpoint_activations
+# (avoid forwarding model multiple times)
+class StandardTransformerDecoderLayer(nn.Module):
+    """Rewrite fairseq.modules.TransformerDecoderLayer to avoid forwarding
+    model multiple times and be compatible with checkpoint_activations.
+
+    The input is expected to be a list of tensors from different channels,
+    each is forwarded to the same model (shared attention weights).
+
+    In the original paper each operation (multi-head attention, encoder
+    attention or FFN) is postprocessed with: `dropout -> add residual ->
+    layernorm`. In the tensor2tensor code they suggest that learning is more
+    robust when preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.decoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): if ``True``, the encoder attention
+            sub-layer and its layer norm are not built (default: False).
+        add_bias_kv (bool, optional): forwarded to the self-attention module
+            (default: False).
+        add_zero_attn (bool, optional): forwarded to the self-attention module
+            (default: False).
+    """
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.quant_noise = getattr(args, "quant_noise_pq", 0)
+        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+
+        self.cross_self_attention = getattr(args, "cross_self_attention", False)
+
+        self.self_attn = self.build_self_attention(
+            self.embed_dim,
+            args,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+
+        # Fall back to ReLU when args carries no activation_fn.
+        self.activation_fn = utils.get_activation_fn(
+            activation=str(args.activation_fn)
+            if getattr(args, "activation_fn", None) is not None
+            else "relu"
+        )
+        activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+        self.normalize_before = args.decoder_normalize_before
+
+        # use layerNorm rather than FusedLayerNorm for exporting.
+        # char_inputs can be used to determine this.
+        # TODO  remove this once we update apex with the fix
+        export = getattr(args, "char_inputs", False)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
+
+        self.fc1 = self.build_fc1(
+            self.embed_dim,
+            args.decoder_ffn_embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+        self.fc2 = self.build_fc2(
+            args.decoder_ffn_embed_dim,
+            self.embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
+        self.need_attn = True
+
+        self.onnx_trace = False
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        """Build the first feed-forward projection, wrapped in quant_noise."""
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        """Build the second feed-forward projection, wrapped in quant_noise."""
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_self_attention(
+        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
+    ):
+        """Build the self-attention module shared by all channels."""
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            # With cross_self_attention the keys/values also contain the
+            # encoder output, so the module is not flagged as self-attention.
+            self_attention=not getattr(args, "cross_self_attention", False),
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+
+    def build_encoder_attention(self, embed_dim, args):
+        """Build the encoder-decoder attention module shared by all channels."""
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            kdim=getattr(args, "encoder_embed_dim", None),
+            vdim=getattr(args, "encoder_embed_dim", None),
+            dropout=args.attention_dropout,
+            encoder_decoder_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+        )
+
+    def prepare_for_onnx_export_(self):
+        """Switch the layer to ONNX tracing mode (``forward`` then also
+        returns the per-channel self-attention states)."""
+        self.onnx_trace = True
+
+    def residual_connection(self, x, residual):
+        """Add the residual branch back onto *x*."""
+        return residual + x
+
+    def forward(
+        self,
+        x_list_tensor: List[torch.Tensor],
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        incremental_state: Optional[
+            List[Dict[str, Dict[str, Optional[Tensor]]]]
+        ] = None,
+        prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """
+        Args:
+            x_list_tensor (List[Tensor]): list of input tensors in different channels,
+                each tensor is of shape `(seq_len, batch, embed_dim)`
+            encoder_out (Tensor, optional): encoder output attended to by the
+                encoder attention (and prepended to the keys when
+                cross_self_attention is enabled)
+            encoder_padding_mask (ByteTensor, optional): binary
+                ByteTensor of shape `(batch, src_len)` where padding
+                elements are indicated by ``1``.
+            incremental_state (optional): list of incremental_state dictionaries over
+                different channels (sequence generation mode)
+            prev_self_attn_state (optional): list over channels of
+                self-attention states `[prev_key, prev_value]`, optionally
+                followed by `prev_key_padding_mask`
+            prev_attn_state (optional): encoder-attention state
+                `[prev_key, prev_value]`, optionally followed by
+                `prev_key_padding_mask`; the same state is used for every channel
+            self_attn_mask (Tensor, optional): attention mask for the self-attention
+            self_attn_padding_mask (Tensor, optional): key padding mask for
+                the self-attention
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return attention weights
+                for each head (default: return average over heads).
+
+        Returns:
+            tuple of (stacked per-channel outputs, list of per-channel
+            attention outputs, list of per-channel self-attention states when
+            tracing for ONNX with an incremental state, else ``None``)
+        """
+        n_channels = len(x_list_tensor)
+        if need_head_weights:
+            need_attn = True
+
+        # incremental_state is a list of dictionaries over different channels
+        if incremental_state is not None:
+            assert isinstance(incremental_state, list)
+            assert len(incremental_state) == n_channels
+
+        # prev_self_attn_state is a list of self_attn_state over different channels
+        if prev_self_attn_state is not None:
+            assert isinstance(prev_self_attn_state, list)
+            assert len(prev_self_attn_state) == n_channels
+
+        # Keep the caller-provided masks. With cross_self_attention they are
+        # extended with an encoder part for each channel, and that extension
+        # must always start from the originals rather than from the previous
+        # channel's already-extended masks.
+        self_attn_mask_orig = self_attn_mask
+        self_attn_padding_mask_orig = self_attn_padding_mask
+
+        x_list = []
+        attn_list = []
+        for i, x in enumerate(x_list_tensor):
+            # Every channel owns its own incremental-state dictionary.
+            channel_state = (
+                incremental_state[i] if incremental_state is not None else None
+            )
+            residual = x
+
+            if self.normalize_before:
+                x = self.self_attn_layer_norm(x)
+
+            if prev_self_attn_state is not None:
+                channel_prev_state = prev_self_attn_state[i]
+                prev_key, prev_value = channel_prev_state[:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(channel_prev_state) >= 3:
+                    saved_state["prev_key_padding_mask"] = channel_prev_state[2]
+                assert incremental_state is not None
+                self.self_attn._set_input_buffer(channel_state, saved_state)
+            _self_attn_input_buffer = self.self_attn._get_input_buffer(channel_state)
+
+            channel_attn_mask = self_attn_mask_orig
+            channel_padding_mask = self_attn_padding_mask_orig
+            if self.cross_self_attention and not (
+                incremental_state is not None
+                and _self_attn_input_buffer is not None
+                and "prev_key" in _self_attn_input_buffer
+            ):
+                # No cached keys yet: attend over [encoder_out; x] and widen
+                # the masks with an all-zero block for the encoder positions.
+                if self_attn_mask_orig is not None:
+                    assert encoder_out is not None
+                    channel_attn_mask = torch.cat(
+                        (
+                            x.new_zeros(x.size(0), encoder_out.size(0)),
+                            self_attn_mask_orig,
+                        ),
+                        dim=1,
+                    )
+                if self_attn_padding_mask_orig is not None:
+                    if encoder_padding_mask is None:
+                        assert encoder_out is not None
+                        encoder_padding_mask = self_attn_padding_mask_orig.new_zeros(
+                            encoder_out.size(1), encoder_out.size(0)
+                        )
+                    channel_padding_mask = torch.cat(
+                        (encoder_padding_mask, self_attn_padding_mask_orig), dim=1
+                    )
+                assert encoder_out is not None
+                y = torch.cat((encoder_out, x), dim=0)
+            else:
+                y = x
+
+            x, attn = self.self_attn(
+                query=x,
+                key=y,
+                value=y,
+                key_padding_mask=channel_padding_mask,
+                incremental_state=channel_state,
+                need_weights=False,
+                attn_mask=channel_attn_mask,
+            )
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.self_attn_layer_norm(x)
+
+            if self.encoder_attn is not None and encoder_out is not None:
+                residual = x
+                if self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+                if prev_attn_state is not None:
+                    prev_key, prev_value = prev_attn_state[:2]
+                    saved_state: Dict[str, Optional[Tensor]] = {
+                        "prev_key": prev_key,
+                        "prev_value": prev_value,
+                    }
+                    if len(prev_attn_state) >= 3:
+                        saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                    assert incremental_state is not None
+                    self.encoder_attn._set_input_buffer(channel_state, saved_state)
+
+                x, attn = self.encoder_attn(
+                    query=x,
+                    key=encoder_out,
+                    value=encoder_out,
+                    key_padding_mask=encoder_padding_mask,
+                    incremental_state=channel_state,
+                    static_kv=True,
+                    need_weights=need_attn or (not self.training and self.need_attn),
+                    need_head_weights=need_head_weights,
+                )
+                x = self.dropout_module(x)
+                x = self.residual_connection(x, residual)
+                if not self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+
+            # Position-wise feed-forward sub-layer.
+            residual = x
+            if self.normalize_before:
+                x = self.final_layer_norm(x)
+
+            x = self.activation_fn(self.fc1(x))
+            x = self.activation_dropout_module(x)
+            x = self.fc2(x)
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.final_layer_norm(x)
+
+            x_list.append(x)
+            attn_list.append(attn)
+
+        # Trick for the checkpoint activation
+        x_list_tensor = torch.stack(x_list)
+        if self.onnx_trace and incremental_state is not None:
+            self_attn_state_list = []
+            for i in range(n_channels):
+                saved_state = self.self_attn._get_input_buffer(incremental_state[i])
+                assert saved_state is not None
+                if self_attn_padding_mask is not None:
+                    self_attn_state = [
+                        saved_state["prev_key"],
+                        saved_state["prev_value"],
+                        saved_state["prev_key_padding_mask"],
+                    ]
+                else:
+                    self_attn_state = [
+                        saved_state["prev_key"],
+                        saved_state["prev_value"],
+                    ]
+                self_attn_state_list.append(self_attn_state)
+            return x_list_tensor, attn_list, self_attn_state_list
+        return x_list_tensor, attn_list, None
+
+    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
+        """Store *need_attn* on the layer; extra keyword arguments are ignored."""
+        self.need_attn = need_attn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
new file mode 100644
index 0000000000..a88e144577
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .multichannel_sequence_generator import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
new file mode 100644
index 0000000000..db4b77f345
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
@@ -0,0 +1,430 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
+class MultichannelSearch(nn.Module):
+    """Base class for search strategies that decode several channels jointly.
+
+    Subclasses implement :meth:`step`; the constraint-related hooks are no-ops
+    here.
+
+    Args:
+        tgt_dicts: mapping from channel to that channel's target dictionary.
+            All dictionaries must agree on the pad, unk and eos indices.
+    """
+
+    def __init__(self, tgt_dicts):
+        super().__init__()
+        reference_dict = list(tgt_dicts.values())[0]
+        self.pad = reference_dict.pad()
+        self.unk = reference_dict.unk()
+        self.eos = reference_dict.eos()
+        # The special symbols are stored only once, so every channel's
+        # dictionary has to use the same indices for them.
+        for tgt_dict in tgt_dicts.values():
+            assert self.pad == tgt_dict.pad()
+            assert self.unk == tgt_dict.unk()
+            assert self.eos == tgt_dict.eos()
+        self.vocab_sizes = {
+            channel: len(tgt_dict) for channel, tgt_dict in tgt_dicts.items()
+        }
+        self.src_lengths = torch.tensor(-1)
+        self.supports_constraints = False
+        self.stop_on_max_len = False
+
+    def step(
+        self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None
+    ):
+        """Take a single search step.
+
+        Args:
+            step: the current search step, starting at 0
+            lprobs: dictionary of channels {channel : (bsz x input_beam_size x vocab_size_channel)}
+                the model's log-probabilities over the vocabulary at the current step
+            scores: {channel : (bsz x input_beam_size x step)}
+                the historical model scores of each hypothesis up to this point
+            prev_output_tokens: {channel : (bsz x step)}
+                the previously generated output tokens
+            original_batch_idxs: (bsz)
+                the tensor with the batch indices, in the range [0, bsz)
+                this is useful in case a re-ordering has been applied
+                and we need to know the original indices
+
+        Return: A tuple of (scores, indices, beams) where:
+            scores: {channel : (bsz x output_beam_size)}
+                the scores of the chosen elements; output_beam_size can be
+                larger than input_beam_size, e.g., we may return
+                2*input_beam_size to account for EOS
+            indices: {channel : (bsz x output_beam_size)}
+                the indices of the chosen elements
+            beams: (bsz x output_beam_size)
+                the hypothesis ids of the chosen elements, in the range [0, input_beam_size)
+        """
+        raise NotImplementedError
+
+    @torch.jit.export
+    def set_src_lengths(self, src_lengths):
+        """Store *src_lengths* on the search object."""
+        self.src_lengths = src_lengths
+
+    @torch.jit.export
+    def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
+        """Initialize constraint states for constrained decoding (if supported).
+
+        The base implementation does nothing and returns ``None``.
+
+        Args:
+            batch_constraints: (torch.Tensor, optional)
+                the list of constraints, in packed form
+            beam_size: (int)
+                the beam size
+        """
+        pass
+
+    def prune_sentences(self, batch_idxs: Tensor):
+        """
+        Removes constraint states for completed sentences (if supported).
+        This is called from sequence_generator._generate() when sentences are
+        deleted from the batch.
+
+        The base implementation does nothing.
+
+        Args:
+            batch_idxs: Indices of *sentences* whose constraint state should be *kept*.
+        """
+        pass
+
+    def update_constraints(self, active_hypos: Tensor):
+        """
+        Updates the constraint states by selecting the beam items that are retained.
+        This is called at each time step of sequence_generator._generate() when
+        the set of 2 * {beam_size} candidate hypotheses are reduced to the beam size.
+
+        The base implementation does nothing.
+
+        Args:
+            active_hypos: (batch size, beam size)
+              list of integers denoting, for each sentence, which beam candidate items
+              should be kept.
+        """
+        pass
+
+
+def unravel_index(index, shape):
+    """Convert flat row-major indices into per-dimension coordinates.
+
+    Args:
+        index: flat indices into an array of the given *shape*
+        shape: sizes of the dimensions, outermost first
+
+    Returns:
+        the coordinates stacked along a new last dimension, ordered like
+        *shape*
+    """
+    coords = []
+    remainder = index
+    for extent in reversed(shape):
+        # The innermost dimension is peeled off first; prepending keeps the
+        # coordinates in the same order as ``shape``.
+        coords.insert(0, remainder % extent)
+        remainder = remainder // extent
+    return torch.stack(tuple(coords), dim=-1)
+
+
+def topk_sum(lprobs_list, k):
+    """Find the k best (beam, token_1, ..., token_n) combinations by summed score.
+
+    lprobs_list = [lprobs_1,...,lprobs_n], where:
+        lprobs_1 : (batch_size x beam_size x vocab_1)
+        ...
+        lprobs_n : (batch_size x beam_size x vocab_n)
+
+    Return:
+        - topk_sum_values : (batch_size x k)
+            values of the topk sum of the form :
+                lprobs_1[bsz, beam_idx, vocab_1_idx] + ... + lprobs_n[bsz, beam_idx, vocab_n_idx]
+        - topk_sum_indices : (batch_size x k x n)
+            each n-tensor being [vocab_1_idx, ..., vocab_n_idx]
+        - topk_sum_beams : (batch_size x k)
+            the beam_idx of each selected combination
+    """
+    # Reduce all lprobs to k candidates first to reduce later complexity
+    # We may assume that k << vocab
+    lprobs_topk_list = []
+    lprobs_topk_indices_list = []
+    for lprobs in lprobs_list:
+        k_i = min(k, lprobs.size(-1))
+        topk_values, topk_indices = torch.topk(lprobs, k=k_i)
+        # topk_values : (batch_size x beam_size x k_i)
+        # topk_indices : (batch_size x beam_size x k_i)
+        lprobs_topk_list.append(topk_values)
+        lprobs_topk_indices_list.append(topk_indices)
+
+    # Compute all possible sums by broadcasting every channel along its own
+    # trailing axis.
+    sum_lprobs_topk = lprobs_topk_list[0]
+    for i in range(1, len(lprobs_topk_list)):
+        unsqueezed_lprobs = lprobs_topk_list[i]
+        for _ in range(i):
+            unsqueezed_lprobs = unsqueezed_lprobs.unsqueeze(-2)
+        sum_lprobs_topk = sum_lprobs_topk.unsqueeze(-1) + unsqueezed_lprobs
+    # sum_lprobs_topk : (batch_size x beam_size x k_1 x ... x k_n)
+
+    # Get the top k sums and their positions in the reduced candidate grid
+    topk_sum_values, flat_indices = torch.topk(
+        sum_lprobs_topk.view(sum_lprobs_topk.size(0), -1), k=k
+    )
+    # topk_sum_values : (batch_size x k)
+    # flat_indices : (batch_size x k)
+    grid_coords = unravel_index(flat_indices, tuple(sum_lprobs_topk.shape[1:]))
+    # grid_coords : (batch_size x k x n+1), i.e. [beam_idx, rank_1, ..., rank_n]
+    topk_sum_beams = grid_coords[:, :, 0]
+
+    # Translate each channel's rank within its reduced candidate list back to
+    # the true vocabulary index with one gather per channel.
+    batch_idx = torch.arange(
+        grid_coords.size(0), device=grid_coords.device
+    ).unsqueeze(-1)
+    topk_sum_indices = torch.stack(
+        [
+            channel_topk_indices[batch_idx, topk_sum_beams, grid_coords[:, :, j + 1]]
+            for j, channel_topk_indices in enumerate(lprobs_topk_indices_list)
+        ],
+        dim=-1,
+    )
+    # topk_sum_indices : (batch_size x k x n)
+
+    return topk_sum_values, topk_sum_indices, topk_sum_beams
+
+
+class MultichannelBeamSearch(MultichannelSearch):
+    """Beam search over several channels whose tensors are kept in
+    dictionaries keyed by channel.
+
+    Candidates are ranked by the sum of the per-channel cumulative scores
+    (see ``topk_sum``).
+    """
+
+    def __init__(self, tgt_dicts):
+        super().__init__(tgt_dicts)
+        self.constraint_states = None
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores: Optional[Dict[str, Tensor]],
+        prev_output_tokens: Optional[Dict[str, Tensor]] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        """Select ``2 * beam_size`` candidates for the current step.
+
+        Args:
+            step: the current search step, starting at 0
+            lprobs: {channel : (bsz x beam_size x vocab_size_channel)}
+            scores: {channel : (bsz x beam_size x step)}; required when
+                ``step > 0``
+            prev_output_tokens: unused by this implementation
+            original_batch_idxs: unused by this implementation
+
+        Returns:
+            tuple of (scores_buf, indices_buf, beams_buf) where scores_buf and
+            indices_buf are dictionaries {channel : (bsz x 2*beam_size)}
+            holding each channel's own cumulative score and token index, and
+            beams_buf is (bsz x 2*beam_size)
+        """
+        channels = list(lprobs.keys())
+        bsz, beam_size, _ = lprobs[channels[0]].size()
+
+        lprobs_list = []
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            for channel in channels:
+                lprobs_list.append(lprobs[channel][:, ::beam_size, :].contiguous())
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            for channel in channels:
+                lprobs_list.append(
+                    lprobs[channel] + scores[channel][:, :, step - 1].unsqueeze(-1)
+                )
+
+        # The summed scores only serve to rank the candidates; the per-channel
+        # scores are looked up below.
+        _, topk_sum_indices, beams_buf = topk_sum(lprobs_list, k=beam_size * 2)
+
+        # Row selector broadcast against the (bsz x 2*beam_size) index tensors.
+        batch_idx = torch.arange(bsz, device=beams_buf.device).unsqueeze(-1)
+        scores_buf = {}
+        indices_buf = {}
+        for i, channel in enumerate(channels):
+            indices_buf[channel] = topk_sum_indices[:, :, i]
+            # Gather every candidate's score in one indexing operation; the
+            # result is detached so it carries no autograd history.
+            scores_buf[channel] = lprobs_list[i][
+                batch_idx, beams_buf, indices_buf[channel]
+            ].detach()
+
+        # At this point, beams_buf and indices_buf are single-dim and contain relative indices
+        return scores_buf, indices_buf, beams_buf
+
+
+class ContiguousMultichannelBeamSearch(MultichannelSearch):  # search step with channels on the last tensor axis
+    def __init__(self, tgt_dicts):
+        super().__init__(tgt_dicts)
+        self.constraint_states = None
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,  # indexable by channel position: lprobs[i] is the log-prob tensor of channel i
+        scores: Optional[Tensor],  # read as scores[:, :, step - 1, i] when step > 0
+        prev_output_tokens: Optional[Tensor] = None,  # not used in this method
+        original_batch_idxs: Optional[Tensor] = None,  # not used in this method
+    ):
+        n_channels = len(lprobs)
+        bsz, beam_size, _ = lprobs[0].size()
+
+        lprobs_list = []
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            for i in range(n_channels):
+                lprobs_list.append(lprobs[i][:, ::beam_size, :].contiguous())
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            for i in range(n_channels):
+                lprobs_list.append(lprobs[i] + scores[:, :, step - 1, i].unsqueeze(-1))
+
+        topk_sum_values, topk_sum_indices, topk_sum_beams = topk_sum(  # the values are not used below
+            lprobs_list, k=beam_size * 2
+        )
+
+        beams_buf = topk_sum_beams
+        indices_buf = topk_sum_indices  # indexed below as [batch, candidate, channel]
+        scores_buf = (
+            torch.tensor(
+                [
+                    lprobs_list[i][i_batch, i_beam, i_index]  # looked up one element at a time
+                    for i in range(len(lprobs_list))  # channel is the outermost loop (channel-major list)
+                    for i_batch in range(bsz)
+                    for i_beam, i_index in zip(
+                        beams_buf[i_batch], indices_buf[i_batch, :, i]
+                    )
+                ]
+            )
+            .view(len(lprobs_list), bsz, -1)
+            .permute(1, 2, 0)  # move the channel axis last, matching indices_buf
+            .to(lprobs_list[0].device)
+        )
+
+        # NOTE(review): beams_buf and indices_buf are passed on from topk_sum as is; presumably relative indices
+        return scores_buf, indices_buf, beams_buf
+
+
+class ContiguousMultichannelSampling(MultichannelSearch):  # samples tokens independently for each channel
+    sampling_topk: int  # > 0 enables top-k sampling (only consulted when sampling_topp <= 0)
+    sampling_topp: float  # > 0 enables top-P sampling and takes precedence over sampling_topk
+
+    def __init__(self, tgt_dicts, sampling_topk=-1, sampling_topp=-1.0):  # defaults leave both truncations off
+        super().__init__(tgt_dicts)
+        self.sampling_topk = sampling_topk
+        self.sampling_topp = sampling_topp
+
+    def _sample_topp(self, lprobs):
+        """Sample among the smallest set of elements whose cumulative probability mass exceeds p.
+
+        See `"The Curious Case of Neural Text Degeneration"
+        (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.
+
+        Args:
+            lprobs: (bsz x input_beam_size x vocab_size)
+                the model's log-probabilities over the vocabulary at the current step
+
+        Return: A tuple of (trimmed_probs, truncated_indices) where:
+            trimmed_probs: (bsz x input_beam_size x ?)
+                the model's probabilities over the elements selected to sample from. The
+                width of the third dimension is determined by top-P.
+            truncated_indices: (bsz x input_beam_size x ?)
+                the indices of the chosen elements.
+        """
+        probs = lprobs.exp_()  # in place: the caller's tensor now holds probabilities
+
+        # sort the last dimension (vocab dimension) in descending order
+        sorted_probs, sorted_indices = probs.sort(descending=True)
+
+        # compute a mask to indicate the words to be included in the top-P set.
+        cumsum_probs = sorted_probs.cumsum(dim=2)
+        mask = cumsum_probs.lt(self.sampling_topp)
+
+        # note that mask was computed by 'lt'. One more word needs to be included
+        # so that the cumulative probability mass can exceed p.
+        cumsum_mask = mask.cumsum(dim=2)
+        last_included = cumsum_mask[:, :, -1:]  # count of entries below p, i.e. the position of the extra word
+        last_included.clamp_(0, mask.size()[2] - 1)  # stay in range when every entry was below p
+        mask = mask.scatter_(2, last_included, 1)
+
+        # truncate unnecessary dims.
+        max_dim = last_included.max()  # the widest top-P set in the batch fixes the truncation width
+        truncated_mask = mask[:, :, : max_dim + 1]
+        truncated_probs = sorted_probs[:, :, : max_dim + 1]
+        truncated_indices = sorted_indices[:, :, : max_dim + 1]
+
+        # trim the words that are not in top-P by setting their probabilities
+        # to 0, so that they would not be sampled later.
+        trim_mask = ~truncated_mask
+        trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
+        return trimed_probs, truncated_indices
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,  # indexable by channel position; its entries are replaced / exponentiated in place below
+        scores,  # read as scores[:, :, step - 1, i] when step > 0
+        prev_output_tokens: Optional[Tensor] = None,  # not used in this method
+        original_batch_idxs: Optional[Tensor] = None,  # not used in this method
+    ):
+        n_channels = len(lprobs)
+        bsz, beam_size, vocab_size = lprobs[0].size()
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            for i in range(n_channels):
+                lprobs[i] = lprobs[i][:, ::beam_size, :].contiguous()
+
+        probs = []
+        top_indices = []
+        for i in range(n_channels):
+            if self.sampling_topp > 0:
+                # only sample from the smallest set of words whose cumulative probability mass exceeds p
+                probs_i, top_indices_i = self._sample_topp(lprobs[i])
+            elif self.sampling_topk > 0:
+                # only sample from top-k candidates
+                lprobs[i], top_indices_i = lprobs[i].topk(
+                    min(self.sampling_topk, lprobs[i].size(-1))
+                )
+                probs_i = lprobs[i].exp_()
+            else:
+                probs_i = lprobs[i].exp_()
+
+                # dummy data to be consistent with true branch for type check
+                top_indices_i = torch.empty(0).to(probs_i)
+            probs.append(probs_i)
+            top_indices.append(top_indices_i)
+        # sample: at step 0 draw beam_size tokens from each sentence's single row, afterwards one token per beam
+        indices_buf = []
+        for i in range(n_channels):
+            if step == 0:
+                indices_buf.append(
+                    torch.multinomial(
+                        probs[i].view(bsz, -1),
+                        beam_size,
+                        replacement=True,
+                    ).view(bsz, beam_size)
+                )
+            else:
+                indices_buf.append(
+                    torch.multinomial(
+                        probs[i].view(bsz * beam_size, -1),
+                        1,
+                        replacement=True,
+                    ).view(bsz, beam_size)
+                )
+
+        if step == 0:
+            for i in range(n_channels):
+                # expand to beam size
+                probs[i] = probs[i].expand(bsz, beam_size, -1)
+
+        # gather scores
+        scores_buf = []
+        for i in range(n_channels):
+            scores_buf.append(
+                torch.gather(probs[i], dim=2, index=indices_buf[i].unsqueeze(-1))
+            )
+            scores_buf[i] = scores_buf[i].log_().view(bsz, -1)  # back to log space, one row per sentence
+
+        # remap indices if using top-k or top-P sampling
+        if self.sampling_topk > 0 or self.sampling_topp > 0:
+            for i in range(n_channels):
+                indices_buf[i] = torch.gather(
+                    top_indices[i].expand(bsz, beam_size, -1),
+                    dim=2,
+                    index=indices_buf[i].unsqueeze(-1),
+                ).squeeze(2)
+
+        if step == 0:
+            beams_buf = indices_buf[0].new_zeros(bsz, beam_size)  # every candidate comes from beam 0
+        else:
+            beams_buf = torch.arange(0, beam_size).to(indices_buf[0]).repeat(bsz, 1)  # candidate j extends beam j
+            # make scores cumulative
+            for i in range(n_channels):
+                scores_buf[i].add_(
+                    torch.gather(scores[:, :, step - 1, i], dim=1, index=beams_buf)
+                )
+        scores_buf = torch.stack(scores_buf, dim=-1)  # channels become the last axis
+        indices_buf = torch.stack(indices_buf, dim=-1)
+
+        return scores_buf, indices_buf, beams_buf
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
new file mode 100644
index 0000000000..24807b866d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
@@ -0,0 +1,1110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Optional
+
+from omegaconf.listconfig import ListConfig
+from omegaconf.dictconfig import DictConfig
+
+import torch
+import torch.nn as nn
+from fairseq.models import FairseqIncrementalDecoder
+from torch import Tensor
+from fairseq.ngram_repeat_block import NGramRepeatBlock
+from .multichannel_search import ContiguousMultichannelBeamSearch
+from fairseq.models.speech_dlm import SpeechDLM
+
+
+class MultichannelSequenceGenerator(nn.Module):
+    def __init__(
+        self,
+        models,
+        tgt_dicts,
+        beam_size=1,
+        max_len_a=0,
+        max_len_b=200,
+        min_len=1,
+        normalize_scores=True,
+        len_penalty=1.0,
+        unk_penalty=0.0,
+        temperature=1.0,
+        match_source_len=False,
+        no_repeat_ngram_size=0,
+        search_strategy=None,
+        eos=None,
+        symbols_to_strip_from_output=None,
+        lm_model=None,
+        lm_weight=1.0,
+        duration_temperature=1.0,
+    ):
+        """Generate multi-channel parallel units with the SpeechDLM model
+        as described in the paper: https://arxiv.org/pdf/2203.16502.pdf;
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models,
+                currently support fairseq.models.TransformerModel for scripting
+            tgt_dicts (dict): target dictionary of each channel, keyed by channel
+                name; pad/unk/eos indices are read from the first dictionary
+            beam_size (int, optional): beam width (default: 1)
+            max_len_a/b (int, optional): generate sequences of maximum length
+                ax + b, where x is the source length
+            min_len (int, optional): minimum output length, excluding end-of-sentence
+            normalize_scores (bool, optional): length-normalize scores (default: True)
+            len_penalty (float, optional): length penalty, where <1.0 favors
+                shorter, >1.0 favors longer sentences (default: 1.0)
+            unk_penalty (float, optional): unknown word penalty, where <0
+                produces more unks, >0 produces fewer (default: 0.0)
+            temperature (float, list or dict, optional): values >1.0 produce more
+                uniform and values <1.0 sharper samples; a scalar applies to all
+                channels, a list follows channel order (default: 1.0)
+            match_source_len (bool, optional): match the source length (default: False)
+            no_repeat_ngram_size (int, optional): if > 0, build an NGramRepeatBlock of this size
+            search_strategy: search object (default: ContiguousMultichannelBeamSearch)
+            duration_temperature (float, optional): rate of the duration prediction,
+                higher rate induces a faster generated wav (default: 1.0)
+        """
+        super().__init__()
+        if isinstance(models, MultichannelEnsembleModel):  # an already-built ensemble is used as is
+            self.model = models
+        else:
+            self.model = MultichannelEnsembleModel(models)
+        self.tgt_dicts = tgt_dicts
+        self.pad = list(tgt_dicts.values())[0].pad()
+        self.unk = list(tgt_dicts.values())[0].unk()
+        self.eos = list(tgt_dicts.values())[0].eos() if eos is None else eos  # explicit eos wins
+        self.symbols_to_strip_from_output = (  # eos is always part of the set
+            symbols_to_strip_from_output.union({self.eos})
+            if symbols_to_strip_from_output is not None
+            else {self.eos}
+        )
+        self.channels = list(tgt_dicts.keys())
+        self.n_channels = len(self.channels)
+        self.vocab_sizes = [len(tgt_dicts[channel]) for channel in self.channels]
+        # the max beam size is the product over channels of (dictionary size - 1), since we never select pad
+        max_possible_beam_size = 1
+        for i in self.vocab_sizes:
+            max_possible_beam_size *= i - 1
+        self.beam_size = min(beam_size, max_possible_beam_size)
+        self.max_len_a = max_len_a
+        self.max_len_b = max_len_b
+        self.min_len = min_len
+
+        self.normalize_scores = normalize_scores
+        self.len_penalty = len_penalty
+        self.unk_penalty = unk_penalty
+        if isinstance(temperature, (int, float)):  # one value shared by every channel
+            temperature = {channel: temperature for channel in self.channels}
+        elif isinstance(temperature, ListConfig) or isinstance(temperature, list):  # matched by channel order
+            temperature = {
+                channel: temperature[i] for i, channel in enumerate(self.channels)
+            }
+        assert isinstance(temperature, DictConfig) or isinstance(
+            temperature, dict
+        ), f"temperature: expected dict, but found {type(temperature)}"
+        self.temperature = temperature
+        self.match_source_len = match_source_len
+
+        if no_repeat_ngram_size > 0:
+            self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
+        else:
+            self.repeat_ngram_blocker = None
+
+        for channel in temperature:
+            assert temperature[channel] > 0, "--temperature must be greater than 0"
+
+        if search_strategy is None:
+            self.search = ContiguousMultichannelBeamSearch(tgt_dicts)
+        else:
+            self.search = search_strategy
+        # src_lengths only needs to be set for LengthConstrainedBeamSearch; as a module
+        # attribute, setting it would break multithreaded use of a shared model.
+        self.should_set_src_lengths = (
+            hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths
+        )
+
+        self.model.eval()
+
+        self.lm_model = lm_model
+        self.lm_weight = lm_weight
+        if self.lm_model is not None:
+            self.lm_model.eval()
+
+        self.duration_prediction = bool(  # NOTE(review): indexes `models`, not self.model -- confirm for the ensemble case
+            str(getattr(models[0].decoder.args, "duration_prediction", "false")).lower()
+            == "true"  # the decoder arg is compared as a lowercase string; a missing arg means False
+        )
+        self.delayed_duration = bool(
+            str(
+                getattr(models[0].decoder.args, "delayed_duration_target", "false")
+            ).lower()
+            == "true"
+        )
+        self.duration_temperature = duration_temperature
+
+    def cuda(self):  # only self.model is moved here; returns self so calls can be chained
+        self.model.cuda()
+        return self
+
+    @torch.no_grad()
+    def forward(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],  # TODO: Modify this
+        prefix_tokens: Optional[Dict[str, Tensor]] = None,
+        bos_token: Optional[int] = None,
+    ):
+        """Generate multi-channel sequences for a batch via ``_generate``.
+
+        Args:
+            sample (dict): batch
+            prefix_tokens (dict of torch.LongTensor, optional): force decoder to begin
+                with these tokens. NOTE(review): _generate stacks it unconditionally,
+                so None looks unsupported despite the default -- confirm
+            bos_token (int, optional): beginning of sentence token (default: self.eos)
+        """
+        return self._generate(sample, prefix_tokens, bos_token=bos_token)
+
+    @torch.no_grad()
+    def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs):
+        """Generate translations. Match the api of other fairseq generators.
+
+        Args:
+            models: unused; generation runs on the ensemble held in self.model
+            sample (dict): batch
+            **kwargs: forwarded to ``_generate``, which accepts
+                prefix_tokens (dict of torch.LongTensor, optional): force decoder
+                    to begin with these tokens
+                constraints (torch.LongTensor, optional): force decoder to include
+                    the list of constraints
+                bos_token (int, optional): beginning of sentence token (default: self.eos)
+        """
+        return self._generate(sample, **kwargs)
+
+    def _generate(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Dict[str, Tensor]] = None,
+        constraints: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):
+        """
+        Here sample is expected to have the following form
+            {
+                'id': index,
+                'net_input': {
+                    'src_tokens': {
+                        'channel1' : tensor((batch x src_length)),
+                        'channel2' : tensor((batch x src_length)),
+                    },
+                    ...
+                },
+            }
+        and prefix_tokens
+            {
+                'channel1' : tensor((batch x prefix_length)),
+                'channel2' : tensor((batch x prefix_length)),
+            }
+        """
+        if self.model.is_speech_dlm:
+            incremental_states = torch.jit.annotate(
+                List[Dict[str, Dict[str, Optional[Tensor]]]],
+                [
+                    torch.jit.annotate(
+                        List[Dict[str, Dict[str, Optional[Tensor]]]],
+                        [{} for _ in range(self.n_channels)],
+                    )
+                    for i in range(self.model.models_size)
+                ],
+            )
+        else:
+            incremental_states = torch.jit.annotate(
+                List[Dict[str, Dict[str, Optional[Tensor]]]],
+                [
+                    torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
+                    for i in range(self.model.models_size)
+                ],
+            )
+        net_input = sample["net_input"]
+        # Convert from dict to tensor form
+        # shape of src_tokens : (bsz x src_len x n_channels)
+        src_tokens = torch.stack(
+            [net_input["src_tokens"][channel] for channel in self.channels], dim=-1
+        )
+        prefix_tokens = torch.stack(
+            [prefix_tokens[channel] for channel in self.channels], dim=-1
+        )
+        # length of the source text being the character length except EndOfSentence and pad
+        src_lengths = (
+            (src_tokens[..., 0].ne(self.eos) & src_tokens[..., 0].ne(self.pad))
+            .long()
+            .sum(dim=1)
+        )
+
+        # bsz: total number of sentences in beam
+        # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
+        bsz, src_len = src_tokens.size()[:2]
+        beam_size = self.beam_size
+
+        if constraints is not None and not self.search.supports_constraints:
+            raise NotImplementedError(
+                "Target-side constraints were provided, but search method doesn't support them"
+            )
+
+        # Initialize constraints, when active
+        self.search.init_constraints(constraints, beam_size)
+
+        max_len: int = -1
+        if self.match_source_len:
+            max_len = src_lengths.max().item()
+        else:
+            max_len = min(
+                int(self.max_len_a * src_len + self.max_len_b),
+                # exclude the EOS marker
+                self.model.max_decoder_positions() - 1,
+            )
+        assert (
+            self.min_len <= max_len
+        ), "min_len cannot be larger than max_len, please adjust these!"
+        # compute the encoder output for each beam
+        encoder_outs = self.model.forward_encoder(net_input)
+
+        # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores
+        new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
+        new_order = new_order.to(src_tokens.device).long()
+        encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
+        # ensure encoder_outs is a List.
+        assert encoder_outs is not None
+
+        # initialize buffers
+        # cumulative scores of hypotheses
+        scores = (
+            torch.zeros(bsz * beam_size, max_len + 1, self.n_channels)
+            .to(src_tokens)
+            .float()
+        )  # +1 for eos; pad is never chosen for scoring
+        tokens = (
+            torch.zeros(bsz * beam_size, max_len + 2, self.n_channels)
+            .to(src_tokens)
+            .long()
+            .fill_(self.pad)
+        )  # +2 for eos and pad
+        tokens[:, 0] = self.eos if bos_token is None else bos_token
+        attn: Optional[Tensor] = None
+
+        # A list that indicates candidates that should be ignored.
+        # For example, suppose we're sampling and have already finalized 2/5
+        # samples. Then cands_to_ignore would mark 2 positions as being ignored,
+        # so that we only finalize the remaining 3 samples.
+        cands_to_ignore = (
+            torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
+        )  # forward and backward-compatible False mask
+
+        # list of completed sentences
+        finalized = torch.jit.annotate(
+            List[List[Dict[str, Tensor]]],
+            [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
+        )  # contains lists of dictionaries of infomation about the hypothesis being finalized at each step
+
+        finished = [
+            False for i in range(bsz)
+        ]  # a boolean array indicating if the sentence at the index is finished or not
+        num_remaining_sent = bsz  # number of sentences remaining
+
+        # number of candidate hypos per step
+        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS
+
+        # offset arrays for converting between different indexing schemes
+        bbsz_offsets = (
+            (torch.arange(0, bsz) * beam_size)
+            .unsqueeze(1)
+            .type_as(tokens)
+            .to(src_tokens.device)
+        )
+        cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)
+
+        reorder_state: Optional[Tensor] = None
+        batch_idxs: Optional[Tensor] = None
+
+        original_batch_idxs: Optional[Tensor] = None
+        if "id" in sample and isinstance(sample["id"], Tensor):
+            original_batch_idxs = sample["id"]
+        else:
+            original_batch_idxs = torch.arange(0, bsz).type_as(tokens)
+
+        if self.duration_prediction:
+            dur_counter = torch.ones(bsz * beam_size, self.n_channels).to(src_tokens)
+            # save the indice where the dur_counter just copied from dur_pred
+            dur_counter_jump_indices = None
+
+        for step in range(max_len + 1):  # one extra step for EOS marker
+            # reorder decoder internal states based on the prev choice of beams
+            if reorder_state is not None:
+                if batch_idxs is not None:
+                    # update beam indices to take into account removed sentences
+                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
+                        batch_idxs
+                    )
+                    reorder_state.view(-1, beam_size).add_(
+                        corr.unsqueeze(-1) * beam_size
+                    )
+                    original_batch_idxs = original_batch_idxs[batch_idxs]
+                self.model.reorder_incremental_state(incremental_states, reorder_state)
+                encoder_outs = self.model.reorder_encoder_out(
+                    encoder_outs, reorder_state
+                )
+
+            input_tokens = {
+                channel: tokens[:, : step + 1, i]
+                for i, channel in enumerate(self.channels)
+            }
+
+            lprobs_dict, avg_attn_scores = self.model.forward_decoder(
+                input_tokens,
+                encoder_outs,
+                incremental_states,
+                self.temperature,
+            )
+
+            # Because the sizes of vocab is different, we cannot concat the lprobs to form a single tensor
+            if not self.duration_prediction:
+                lprobs_list = list(lprobs_dict.values())
+            else:
+                lprobs_list = [
+                    net_output["pred_token"] for net_output in lprobs_dict.values()
+                ]
+
+                # non-positive predicted durations
+                dur_preds = (
+                    torch.stack(
+                        [
+                            net_output["pred_duration"]
+                            for net_output in lprobs_dict.values()
+                        ]
+                    )
+                    .squeeze(-1)
+                    .T
+                )
+                dur_preds = dur_preds / self.duration_temperature
+                dur_preds = dur_preds.round().long()
+                dur_preds[dur_preds < 1] = 1
+
+                # dur_preds & dur_counter needs to be modified when there isn't an edge
+                if step > 0:
+                    non_edge_indices = tokens[:, step, :] == tokens[:, step - 1, :]
+                    if self.delayed_duration:
+                        dur_preds[non_edge_indices] = 1
+                    else:
+                        if dur_counter_jump_indices is not None:
+                            dur_counter[dur_counter_jump_indices & non_edge_indices] = 2
+
+                # update dur_counter
+                if step > 0:
+                    if self.delayed_duration:
+                        dur_counter -= (
+                            (dur_counter == 1)
+                            | (tokens[:, step, :] == tokens[:, step - 1, :])
+                        ).int()
+                        dur_counter[dur_counter < 0] = 0
+                    else:
+                        dur_counter -= (
+                            tokens[:, step, :] == tokens[:, step - 1, :]
+                        ).int()
+                        dur_counter[dur_counter < 1] = 1
+
+                # whether to copy previous token (ie. if the counter is still on)
+                # and get get the new duration
+                if self.delayed_duration:
+                    dur_counter_jump_indices = dur_counter == 0
+                    dur_counter[dur_counter_jump_indices] = dur_preds[
+                        dur_counter_jump_indices
+                    ]
+
+                # whether to copy previous token in this step
+                copy_prev_token = dur_counter != 1
+                if self.delayed_duration is False:
+                    dur_counter_jump_indices = dur_counter == 1
+                    dur_counter[dur_counter_jump_indices] = dur_preds[
+                        dur_counter_jump_indices
+                    ]
+                # else:
+                # dur_counter[dur_counter==0] = dur_preds[dur_counter==0] - 1
+                # copy_prev_token = (dur_counter > 0)
+
+            if self.lm_model is not None:
+                assert False, "Currently not supported in multichannelLM case"
+
+            for i in range(self.n_channels):
+                lprobs_list[i][lprobs_list[i] != lprobs_list[i]] = torch.tensor(
+                    -math.inf
+                ).to(lprobs_list[i])
+
+                lprobs_list[i][:, self.pad] = -math.inf  # never select pad
+                lprobs_list[i][:, self.unk] -= self.unk_penalty  # apply unk penalty
+
+                # handle max length constraint
+                if step >= max_len:
+                    lprobs_list[i][:, : self.eos] = -math.inf
+                    lprobs_list[i][:, self.eos + 1 :] = -math.inf
+                else:
+                    lprobs_list[i][
+                        :, self.eos
+                    ] = -math.inf  # quick fix for short generation
+
+                # handle prefix tokens (possibly with different lengths)
+                if (
+                    prefix_tokens is not None
+                    and step < prefix_tokens.size(1)
+                    and step < max_len
+                ):
+                    (
+                        lprobs_list[i],
+                        tokens[..., i],
+                        scores[..., i],
+                    ) = self._prefix_tokens(
+                        step,
+                        lprobs_list[i],
+                        scores[..., i],
+                        tokens[..., i],
+                        prefix_tokens[..., i],
+                        beam_size,
+                    )
+                    if self.duration_prediction:
+                        # Can copy previous token if the prefix token is padding or unk (1-channel conditionned case)
+                        can_copy_mask = (
+                            prefix_tokens[:, step, i].eq(self.pad)
+                            | prefix_tokens[:, step, i].eq(self.unk)
+                        ).repeat_interleave(beam_size)
+                        copy_prev_token[:, i] &= can_copy_mask
+                elif step < self.min_len:
+                    # minimum length constraint (does not apply if using prefix_tokens)
+                    lprobs_list[i][:, self.eos] = -math.inf
+
+                if self.duration_prediction:
+                    if step < max_len:
+                        for j in range(copy_prev_token.size(0)):
+                            if copy_prev_token[j, i]:
+                                prev_token = tokens[j, step, i]
+                                lprobs_list[i][j, :prev_token] = -math.inf
+                                lprobs_list[i][j, prev_token + 1 :] = -math.inf
+                                # lprobs_list[i][j, prev_token] = 0.
+                                # dur_counter[j,i] -= 1
+                            # else:
+                            #     prev_token = tokens[j, step, i]
+                            # if not (lprobs_list[i][j,:].ne(-math.inf).nonzero() == prev_token).all():
+                            #     lprobs_list[i][j, prev_token] = -math.inf
+                            #     dur_counter[j,i] = 0.
+
+            # Record attention scores, only support avg_attn_scores is a Tensor
+            if avg_attn_scores is not None:
+                if attn is None:
+                    attn = torch.empty(
+                        bsz * beam_size, avg_attn_scores.size(1), max_len + 2
+                    ).to(scores)
+                attn[:, :, step + 1].copy_(avg_attn_scores)
+
+            scores = scores.type_as(lprobs_list[0])
+            eos_bbsz_idx = torch.empty(0).to(
+                tokens
+            )  # indices of hypothesis ending with eos (finished sentences)
+            eos_scores = torch.empty(0).to(
+                scores
+            )  # scores of hypothesis ending with eos (finished sentences)
+
+            if self.should_set_src_lengths:
+                self.search.set_src_lengths(src_lengths)
+
+            if self.repeat_ngram_blocker is not None:
+                for i in range(self.n_channels):
+                    lprobs_list[i] = self.repeat_ngram_blocker(
+                        tokens, lprobs_list[i], bsz, beam_size, step
+                    )
+
+            # Shape: (batch, cand_size)
+            cand_scores, cand_indices, cand_beams = self.search.step(
+                step,
+                [
+                    lprobs_list[i].view(bsz, -1, self.vocab_sizes[i])
+                    for i in range(self.n_channels)
+                ],
+                scores.view(bsz, beam_size, -1, self.n_channels)[:, :, :step, :],
+                tokens[:, : step + 1],
+                original_batch_idxs,
+            )
+
+            # cand_bbsz_idx contains beam indices for the top candidate
+            # hypotheses, with a range of values: [0, bsz*beam_size),
+            # and dimensions: [bsz, cand_size]
+            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+
+            # finalize hypotheses that end in eos
+            # Shape of eos_mask: (batch size, beam size)
+            eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
+            eos_mask = torch.any(eos_mask, dim=-1, keepdim=False)
+            eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)
+
+            # only consider eos when it's among the top beam_size indices
+            # Now we know what beam item(s) to finish
+            # Shape: 1d list of absolute-numbered
+            eos_bbsz_idx = torch.masked_select(
+                cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
+            )
+
+            finalized_sents: List[int] = []
+            if eos_bbsz_idx.numel() > 0:
+                eos_scores = torch.stack(
+                    [
+                        torch.masked_select(
+                            cand_scores[:, :beam_size, i], mask=eos_mask[:, :beam_size]
+                        )
+                        for i in range(self.n_channels)
+                    ],
+                    dim=-1,
+                )
+                finalized_sents = self.finalize_hypos(
+                    step,
+                    eos_bbsz_idx,
+                    eos_scores,
+                    tokens,
+                    scores,
+                    finalized,
+                    finished,
+                    beam_size,
+                    attn,
+                    src_lengths,
+                    max_len,
+                )
+                num_remaining_sent -= len(finalized_sents)
+
+            assert num_remaining_sent >= 0
+            if num_remaining_sent == 0:
+                break
+            if self.search.stop_on_max_len and step >= max_len:
+                break
+            assert step < max_len, f"{step} < {max_len}"
+
+            # Remove finalized sentences (ones for which {beam_size}
+            # finished hypotheses have been generated) from the batch.
+            if len(finalized_sents) > 0:
+                new_bsz = bsz - len(finalized_sents)
+
+                # construct batch_idxs which holds indices of batches to keep for the next pass
+                batch_mask = torch.ones(
+                    bsz, dtype=torch.bool, device=cand_indices.device
+                )
+                batch_mask[finalized_sents] = False
+                # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
+                batch_idxs = torch.arange(
+                    bsz, device=cand_indices.device
+                ).masked_select(batch_mask)
+
+                # Choose the subset of the hypothesized constraints that will continue
+                self.search.prune_sentences(batch_idxs)
+
+                eos_mask = eos_mask[batch_idxs]
+                cand_beams = cand_beams[batch_idxs]
+                bbsz_offsets.resize_(new_bsz, 1)
+                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+                cand_scores = cand_scores[batch_idxs]
+                cand_indices = cand_indices[batch_idxs]
+
+                if prefix_tokens is not None:
+                    prefix_tokens = prefix_tokens[batch_idxs]
+                src_lengths = src_lengths[batch_idxs]
+                cands_to_ignore = cands_to_ignore[batch_idxs]
+
+                scores = scores.view(bsz, -1)[batch_idxs].view(
+                    new_bsz * beam_size, -1, self.n_channels
+                )
+                tokens = tokens.view(bsz, -1)[batch_idxs].view(
+                    new_bsz * beam_size, -1, self.n_channels
+                )
+                if self.duration_prediction:
+                    dur_counter = dur_counter.view(bsz, -1)[batch_idxs].view(
+                        new_bsz * beam_size, self.n_channels
+                    )
+                if attn is not None:
+                    attn = attn.view(bsz, -1)[batch_idxs].view(
+                        new_bsz * beam_size, attn.size(1), -1
+                    )
+                bsz = new_bsz
+            else:
+                batch_idxs = None
+
+            # Set active_mask so that values > cand_size indicate eos hypos
+            # and values < cand_size indicate candidate active hypos.
+            # After, the min values per row are the top candidate active hypos
+
+            # Rewrite the operator since the element wise or is not supported in torchscript.
+
+            eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size]))
+            active_mask = torch.add(
+                eos_mask.type_as(cand_offsets) * cand_size,
+                cand_offsets[: eos_mask.size(1)],
+            )
+
+            # get the top beam_size active hypotheses, which are just
+            # the hypos with the smallest values in active_mask.
+            # {active_hypos} indicates which {beam_size} hypotheses
+            # from the list of {2 * beam_size} candidates were
+            # selected. Shapes: (batch size, beam size)
+            new_cands_to_ignore, active_hypos = torch.topk(
+                active_mask, k=beam_size, dim=1, largest=False
+            )
+
+            # update cands_to_ignore to ignore any finalized hypos.
+            cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
+            # Make sure there is at least one active item for each sentence in the batch.
+            assert (~cands_to_ignore).any(dim=1).all()
+
+            # update cands_to_ignore to ignore any finalized hypos
+            # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
+            # can be selected more than once).
+            active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
+            active_bbsz_idx = active_bbsz_idx.view(-1)
+
+            # active_scores = torch.stack([
+            #     torch.gather(cand_scores[...,0], dim=1, index=active_hypos)
+            #         for i in range(self.n_channels)
+            #         ], dim = -1)
+            # active_scores = active_scores.view(-1)
+
+            # copy tokens and scores for active hypotheses
+
+            # Set the tokens for each beam (can select the same row more than once)
+            tokens[:, : step + 1] = torch.index_select(
+                tokens[:, : step + 1], dim=0, index=active_bbsz_idx
+            )
+            # Select the next token for each of them
+            for i in range(self.n_channels):
+                tokens.view(bsz, beam_size, -1, self.n_channels)[
+                    :, :, step + 1, i
+                ] = torch.gather(cand_indices[..., i], dim=1, index=active_hypos)
+            if step > 0:
+                scores[:, :step] = torch.index_select(
+                    scores[:, :step], dim=0, index=active_bbsz_idx
+                )
+            for i in range(self.n_channels):
+                scores.view(bsz, beam_size, -1, self.n_channels)[
+                    :, :, step, i
+                ] = torch.gather(cand_scores[..., i], dim=1, index=active_hypos)
+
+            if self.duration_prediction:
+                dur_counter = torch.index_select(
+                    dur_counter, dim=0, index=active_bbsz_idx
+                )
+
+            # Update constraints based on which candidates were selected for the next beam
+            self.search.update_constraints(active_hypos)
+
+            # copy attention for active hypotheses
+            if attn is not None:
+                attn[:, :, : step + 2] = torch.index_select(
+                    attn[:, :, : step + 2], dim=0, index=active_bbsz_idx
+                )
+
+            # reorder incremental state in decoder
+            reorder_state = active_bbsz_idx
+
+        # sort by score descending
+        for sent in range(len(finalized)):
+            scores = torch.tensor(
+                [float(elem["score"].item()) for elem in finalized[sent]]
+            )
+            _, sorted_scores_indices = torch.sort(scores, descending=True)
+            finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices]
+            finalized[sent] = torch.jit.annotate(
+                List[Dict[str, Tensor]], finalized[sent]
+            )
+        return finalized
+
+    def _prefix_tokens(
+        self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int
+    ):
+        """Force beams to follow ``prefix_tokens`` at ``step``; returns ``(lprobs, tokens, scores)``."""
+        prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)
+        prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))  # log-prob of each prefix token
+        prefix_mask = prefix_toks.ne(self.pad)  # rows whose prefix token is pad are not constrained
+        # used for 1-channel generation: do not force the unk token (i.e. unk tokens are changed)
+        prefix_mask &= prefix_toks.ne(self.unk)
+        # Force the prefix token on every constrained row: first set the whole
+        # vocabulary of those rows to -inf, then scatter the original log-prob of
+        # the prefix token back, leaving it as the only finite candidate.
+        # (A disabled variant used to zero the log-prob of repeated prefix tokens.)
+        lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs)
+        lprobs[prefix_mask] = lprobs[prefix_mask].scatter(
+            -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask]
+        )
+        # rows whose prefix token is unk are left free, but must not stop: forbid eos there
+        unk_mask = prefix_toks.eq(self.unk)
+        if len(lprobs[unk_mask]) > 0:
+            # modify a copy and assign it back, otherwise it won't assign to lprobs,
+            # see: https://discuss.pytorch.org/t/how-to-mask-and-assign-a-value-to-tensor/18437
+            copy_lprobs = lprobs[unk_mask][:, :]
+            copy_lprobs[:, self.eos] = -math.inf
+            lprobs[unk_mask] = copy_lprobs
+        # if the prefix includes eos, then we should make sure tokens and
+        # scores are the same across all beams
+        eos_mask = prefix_toks.eq(self.eos)
+        if eos_mask.any():
+            # validate that the first beam matches the prefix
+            first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[
+                :, 0, 1 : step + 1
+            ]
+            eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
+            target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
+            assert (first_beam == target_prefix).all()
+
+            # copy tokens, scores and lprobs from the first beam to all beams
+            tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size)
+            scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size)
+            lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size)
+        return lprobs, tokens, scores
+
+    def replicate_first_beam(self, tensor, mask, beam_size: int):  # copy beam 0 over all beams of masked rows
+        tensor = tensor.view(-1, beam_size, tensor.size(-1))  # regroup rows as (-1, beam_size, last_dim)
+        tensor[mask] = tensor[mask][:, :1, :]  # beam 0 (kept as a size-1 axis) is broadcast over the beam axis
+        return tensor.view(-1, tensor.size(-1))  # flatten back to 2-D: (-1, last_dim)
+
+    def finalize_hypos(
+        self,
+        step: int,
+        bbsz_idx,
+        eos_scores,
+        tokens,
+        scores,
+        finalized: List[List[Dict[str, Tensor]]],
+        finished: List[bool],
+        beam_size: int,
+        attn: Optional[Tensor],
+        src_lengths,
+        max_len: int,
+    ):
+        """Finalize hypotheses, store them in `finalized`, and update `finished` accordingly.
+        A sentence is finished once {beam_size} hypotheses are collected for it, or at step == max_len.
+
+        Returns the list of current-batch indices (sentences, not beam items) that just finished;
+        the caller removes these from the batch and does not process them further.
+        Args:
+            bbsz_idx (Tensor): 1-D indices of the beam rows (into `tokens`/`scores`) ending in eos.
+        """
+        assert bbsz_idx.numel() == eos_scores.size(0)
+
+        # clone relevant token and attention tensors.
+        # tokens is (batch * beam, max_len, n_channels) as reshaped by the caller. So the
+        # index_select gets the newly EOS rows, then selects cols 1..{step + 2}
+        tokens_clone = tokens.index_select(0, bbsz_idx)[
+            :, 1 : step + 2
+        ]  # skip the first index, which is EOS
+
+        tokens_clone[:, step] = self.eos  # the last kept column is forced to eos
+        attn_clone = (
+            attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2]
+            if attn is not None
+            else None
+        )
+
+        # compute scores per token position (the last axis of scores holds one entry per channel)
+        pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1]
+        pos_scores[:, step, :] = eos_scores
+        # convert from cumulative to per-position scores
+        pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
+
+        # normalize sentence-level scores (in place: the caller's eos_scores tensor changes too)
+        if self.normalize_scores:
+            eos_scores /= (step + 1) ** self.len_penalty
+
+        # cum_unfin has one entry per still-unfinished sentence: the number of
+        # finished sentences that precede it in the original batch. Adding it
+        # to an index in the current, possibly-reduced batch gives the index
+        # of the same sentence in the original batch.
+        cum_unfin: List[int] = []
+        prev = 0
+        for f in finished:
+            if f:
+                prev += 1
+            else:
+                cum_unfin.append(prev)
+
+        # The keys here are of the form "{sent}_{unfin_idx}", where
+        # "unfin_idx" is the index in the current (possibly reduced)
+        # list of sentences, and "sent" is the index in the original,
+        # unreduced batch
+        # a dict is used as a set because set() is not supported in script export
+        sents_seen: Dict[str, Optional[Tensor]] = {}
+
+        # For every finished beam item
+        for i in range(bbsz_idx.size()[0]):
+            idx = bbsz_idx[i]
+            score = eos_scores[i].sum()  # hypothesis score = sum of its entries (one per channel)
+            # sentence index in the current (possibly reduced) batch
+            unfin_idx = idx // beam_size
+            # sentence index in the original (unreduced) batch
+            sent = unfin_idx + cum_unfin[unfin_idx]
+            # Cannot create dict for key type '(int, int)' in torchscript.
+            # The workaround is to cast int to string
+            seen = str(sent.item()) + "_" + str(unfin_idx.item())
+            if seen not in sents_seen:
+                sents_seen[seen] = None
+
+            if self.match_source_len and step > src_lengths[unfin_idx]:
+                score = torch.tensor(-math.inf).to(score)
+
+            # An input sentence (among those in a batch) is finished when
+            # beam_size hypotheses have been collected for it
+            if len(finalized[sent]) < beam_size:
+                if attn_clone is not None:
+                    # attention of this hypothesis (no padding is removed here)
+                    hypo_attn = attn_clone[i]
+                else:
+                    hypo_attn = torch.empty(0)
+
+                finalized[sent].append(
+                    {
+                        "tokens": tokens_clone[i],
+                        "score": score,
+                        "attention": hypo_attn,  # src_len x tgt_len
+                        "alignment": torch.empty(0),
+                        "positional_scores": pos_scores[i],
+                    }
+                )
+
+        newly_finished: List[int] = []
+
+        for seen in sents_seen.keys():
+            # check termination conditions for this sentence
+            sent: int = int(float(seen.split("_")[0]))
+            unfin_idx: int = int(float(seen.split("_")[1]))
+
+            if not finished[sent] and self.is_finished(
+                step, unfin_idx, max_len, len(finalized[sent]), beam_size
+            ):
+                finished[sent] = True
+                newly_finished.append(unfin_idx)
+
+        return newly_finished
+
+    def is_finished(
+        self,
+        step: int,
+        unfin_idx: int,
+        max_len: int,
+        finalized_sent_len: int,
+        beam_size: int,
+    ):
+        """
+        Tell whether decoding of one sentence is over: either
+        {beam_size} hypotheses have been finalized for it, or the
+        current step equals the maximum length.
+        `unfin_idx` is not used by this check.
+        """
+        assert finalized_sent_len <= beam_size
+        beam_is_full = finalized_sent_len == beam_size
+        return beam_is_full or step == max_len
+
+
+class MultichannelEnsembleModel(nn.Module):
+    """A wrapper around an ensemble of SpeechDLM models."""
+
+    def __init__(self, models):
+        super().__init__()
+        self.models_size = len(models)
+        # method '__len__' is not supported in ModuleList for torch script, so the size is cached above
+        self.single_model = models[0]
+        self.models = nn.ModuleList(models)
+
+        self.has_incremental: bool = False  # True only if every model has an incremental decoder
+        if all(
+            hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
+            for m in models
+        ):
+            self.has_incremental = True
+
+        if isinstance(models[0], SpeechDLM):  # only the first model is inspected
+            self.is_speech_dlm = True
+        # Otherwise it's a multi-channel language model (without cross-prediction outputs)
+        else:
+            self.is_speech_dlm = False
+
+        if getattr(models[0].decoder.args, "duration_prediction", False):
+            self.is_duration_prediction = True
+        else:
+            self.is_duration_prediction = False
+
+    def forward(self):
+        pass  # intentionally a no-op; see forward_encoder / forward_decoder
+
+    def has_encoder(self):
+        return hasattr(self.single_model, "encoder")
+
+    def has_incremental_states(self):
+        return self.has_incremental
+
+    def max_decoder_positions(self):
+        return min([m.max_decoder_positions() for m in self.models])  # most restrictive model wins
+
+    @torch.jit.export
+    def forward_encoder(self, net_input: Dict[str, Tensor]):
+        if not self.has_encoder():
+            return None
+        return [model.encoder.forward_torchscript(net_input) for model in self.models]
+
+    @torch.jit.export
+    def forward_decoder(
+        self,
+        tokens,
+        encoder_outs: List[Dict[str, List[Tensor]]],
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        temperature: Dict[str, float] = 1.0,  # a bare number is expanded to one entry per channel below
+    ):
+        if isinstance(temperature, (float, int)):
+            temperature = {channel: temperature for channel in tokens}
+        log_probs = {channel: [] for channel in tokens}  # per channel: one log-prob tensor per model
+        avg_attn: Optional[Tensor] = None
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None
+        for i, model in enumerate(self.models):
+            if self.has_encoder():
+                encoder_out = encoder_outs[i]
+            # decode each model
+            if self.has_incremental_states():
+                decoder_out = model.decoder.forward(
+                    tokens,
+                    encoder_out=encoder_out,
+                    incremental_state=incremental_states[i],
+                )
+            else:
+                decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out)
+
+            attn: Optional[Tensor] = None
+            decoder_len = len(decoder_out)
+            if decoder_len > 1 and decoder_out[1] is not None:  # second output may carry attention
+                if isinstance(decoder_out[1], Tensor):
+                    attn = decoder_out[1]
+                else:
+                    attn_holder = decoder_out[1]["attn"]
+                    if isinstance(attn_holder, Tensor):
+                        attn = attn_holder
+                    elif attn_holder is not None:
+                        attn = attn_holder[0]
+                if attn is not None:
+                    attn = attn[:, -1, :]  # keep only the last index along dim 1
+
+            if self.is_speech_dlm:
+                if self.is_duration_prediction:
+                    decoder_out_divided_by_temperature = {
+                        channel_src: {
+                            channel_pred: {
+                                "pred_token": decoder_out[0][channel_src][channel_pred][
+                                    "pred_token"
+                                ][:, -1:, :].div_(temperature[channel_pred]),  # in-place scaling
+                                "pred_duration": decoder_out[0][channel_src][
+                                    channel_pred
+                                ]["pred_duration"][:, -1:, :],  # durations are not temperature-scaled
+                            }
+                            for channel_pred in decoder_out[0][channel_src]
+                        }
+                        for channel_src in decoder_out[0]
+                    }
+                else:
+                    decoder_out_divided_by_temperature = {
+                        channel_src: {
+                            channel_pred: decoder_out[0][channel_src][channel_pred][
+                                :, -1:, :
+                            ].div_(temperature[channel_pred])
+                            for channel_pred in decoder_out[0][channel_src]
+                        }
+                        for channel_src in decoder_out[0]
+                    }
+            else:
+                decoder_out_divided_by_temperature = {
+                    channel: decoder_out[0][channel][:, -1:, :].div_(
+                        temperature[channel]
+                    )
+                    for channel in decoder_out[0]
+                }
+            decoder_out_tuple = (
+                decoder_out_divided_by_temperature,
+                None if decoder_len <= 1 else decoder_out[1],
+            )
+
+            probs = model.get_normalized_probs(
+                decoder_out_tuple, log_probs=True, sample=None
+            )
+
+            if self.is_speech_dlm:  # keep only probs[channel][channel]; cross-channel entries are dropped
+                if self.is_duration_prediction:
+                    probs = {
+                        channel: {
+                            "pred_token": probs[channel][channel]["pred_token"][
+                                :, -1, :
+                            ],
+                            "pred_duration": probs[channel][channel]["pred_duration"][
+                                :, -1, :
+                            ],
+                        }
+                        for channel in probs
+                    }
+                else:
+                    probs = {
+                        channel: probs[channel][channel][:, -1, :] for channel in probs
+                    }
+            else:
+                probs = {channel: probs[channel][:, -1, :] for channel in probs}
+            if self.models_size == 1:  # single model: nothing to average
+                return probs, attn
+
+            for channel in probs:
+                log_probs[channel].append(probs[channel])
+            if attn is not None:
+                if avg_attn is None:
+                    avg_attn = attn
+                else:
+                    avg_attn.add_(attn)  # in place: accumulates into the first stored attn tensor
+
+        avg_probs = {}
+        for channel in log_probs:  # logsumexp(log p_i) - log(N) = log of the mean probability
+            avg_probs[channel] = torch.logsumexp(
+                torch.stack(log_probs[channel], dim=0), dim=0
+            ) - math.log(self.models_size)
+
+        if avg_attn is not None:
+            avg_attn.div_(self.models_size)
+        return avg_probs, avg_attn
+
+    @torch.jit.export
+    def reorder_encoder_out(
+        self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order
+    ):
+        """
+        Reorder every model's encoder output according to *new_order*.
+
+        Args:
+            encoder_outs: one encoder output per model, as from ``forward_encoder()``
+            new_order (LongTensor): desired order
+
+        Returns:
+            list of reordered outputs, one per model (empty when there is no encoder)
+        """
+        new_outs: List[Dict[str, List[Tensor]]] = []
+        if not self.has_encoder():
+            return new_outs
+        for i, model in enumerate(self.models):
+            assert encoder_outs is not None
+            new_outs.append(
+                model.encoder.reorder_encoder_out(encoder_outs[i], new_order)
+            )
+        return new_outs
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self,
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order,
+    ):
+        if not self.has_incremental_states():
+            return
+        for i, model in enumerate(self.models):  # delegates to each decoder, which gets its own state dict
+            model.decoder.reorder_incremental_state_scripting(
+                incremental_states[i], new_order
+            )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
new file mode 100644
index 0000000000..dc13f565f1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
@@ -0,0 +1,280 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+from fairseq import utils
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import Embedding
+from .modules.speech_dlm_decoder import CrossChannelTransformerDecoder
+from omegaconf import II
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpeechDLMConfig(FairseqDataclass):  # configuration fields of the "speech_dlm" model
+    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
+        default="relu", metadata={"help": "activation function to use"}
+    )
+    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
+    attention_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability for attention weights"}
+    )
+    activation_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
+    )
+    relu_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
+    )  # NOTE(review): help text is identical to activation_dropout's - confirm this is intended
+    decoder_embed_dim: int = field(
+        default=512, metadata={"help": "decoder embedding dimension"}
+    )
+    decoder_output_dim: int = field(
+        default=512, metadata={"help": "decoder output dimension"}
+    )
+    decoder_input_dim: int = field(
+        default=512, metadata={"help": "decoder input dimension"}
+    )
+    decoder_ffn_embed_dim: int = field(
+        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
+    )
+    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
+    decoder_cross_layers: int = field(
+        default=-1, metadata={"help": "num self cross attention decoder layers"}
+    )  # a negative value makes SpeechDLM.build_model use decoder_layers instead
+    decoder_attention_heads: int = field(
+        default=8, metadata={"help": "num decoder attention heads"}
+    )
+    decoder_normalize_before: bool = field(
+        default=False, metadata={"help": "apply layernorm before each decoder block"}
+    )  # NOTE: base_lm_architecture() overwrites this with True regardless of the value set here
+    no_decoder_final_norm: bool = field(
+        default=False,
+        metadata={"help": "don't add an extra layernorm after the last decoder block"},
+    )
+    no_token_positional_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, disables positional embeddings (outside self attention)"
+        },
+    )
+    share_decoder_input_output_embed: bool = field(
+        default=False, metadata={"help": "share decoder input and output embeddings"}
+    )
+    decoder_learned_pos: bool = field(
+        default=False,
+        metadata={"help": "use learned positional embeddings in the decoder"},
+    )
+    decoder_layerdrop: float = field(
+        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
+    )
+    decoder_layers_to_keep: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "which layers to *keep* when pruning as a comma-separated list"
+        },
+    )  # when set, SpeechDLM.build_model derives decoder_layers from the number of entries
+    layernorm_embedding: bool = field(
+        default=False, metadata={"help": "add layernorm to embedding"}
+    )
+    no_scale_embedding: bool = field(
+        default=False, metadata={"help": "if True, dont scale embeddings"}
+    )
+    checkpoint_activations: bool = field(
+        default=False, metadata={"help": "checkpoint activations at each layer"}
+    )
+    offload_activations: bool = field(
+        default=False,
+        metadata={"help": "move checkpointed activations to CPU after they are used."},
+    )  # base_lm_architecture() turns checkpoint_activations on when this is set
+    quant_noise_pq: float = field(
+        default=0.0,
+        metadata={"help": "iterative PQ quantization noise at training time"},
+    )
+    quant_noise_pq_block_size: int = field(
+        default=8,
+        metadata={"help": "block size of quantization noise at training time"},
+    )
+    # TODO: this is a common option; move it to the parent dataclass
+    quant_noise_scalar: float = field(
+        default=0.0,
+        metadata={
+            "help": "scalar quantization noise and scalar quantization at training time"
+        },
+    )
+    add_bos_token: bool = II("task.add_bos_token")  # II(...): OmegaConf interpolation of another config key
+    tokens_per_sample: int = II("task.tokens_per_sample")
+    max_target_positions: Optional[int] = II("task.max_target_positions")
+    tpu: bool = II("common.tpu")
+    duration_prediction: str = II("task.duration_prediction")  # NOTE(review): read as a flag elsewhere - confirm str
+    delayed_duration_target: str = II("task.delayed_duration_target")
+    main_and_cross_weights: str = II("criterion.main_and_cross_weights")
+
+
+@register_model("speech_dlm", dataclass=SpeechDLMConfig)
+class SpeechDLM(FairseqLanguageModel):
+    """Spoken Unit-based Dialogue Language Model (SpeechDLM) as described
+    in the paper: https://arxiv.org/pdf/2203.16502.pdf
+    """
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if args.decoder_layers_to_keep:  # the kept-layer list determines the layer count
+            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
+
+        if args.decoder_cross_layers < 0:  # negative means "same as decoder_layers"
+            args.decoder_cross_layers = args.decoder_layers
+
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = getattr(
+                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+            )
+
+        # Assert that every channel uses the same dictionary (one embedding table is shared below)
+        assert all(
+            task.source_dictionaries[channel] == task.source_dictionary
+            for channel in task.channels
+        ), "Source dictionaries of all channels are expected to be the same!!!"
+        assert all(
+            task.target_dictionaries[channel] == task.target_dictionary
+            for channel in task.channels
+        ), "Target dictionaries of all channels are expected to be the same!!!"
+        # Build the unit embeddings
+        embed_tokens = cls.build_embedding(
+            args, task.source_dictionary, args.decoder_input_dim
+        )
+
+        decoder = CrossChannelTransformerDecoder(
+            args,
+            task.target_dictionary,
+            embed_tokens,
+            channels=task.channels,
+            no_encoder_attn=True,
+        )
+        return cls(decoder)
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):  # args and path are unused here
+        embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
+        return embed_tokens
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        **kwargs,
+    ):
+        """
+        Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model
+        file. Downloads and caches the pre-trained model file if needed.
+
+        The base implementation returns a
+        :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to
+        generate translations or sample from language models. The underlying
+        :class:`~fairseq.models.FairseqModel` can be accessed via the
+        *generator.models* attribute.
+
+        This override returns a :class:`MultichannelGeneratorHubInterface` object,
+        which allows generation in multiple channels with a multichannel model.
+
+        Args:
+            model_name_or_path (str): either the name of a pre-trained model to
+                load or a path/URL to a pre-trained model state dict
+            checkpoint_file (str, optional): colon-separated list of checkpoint
+                files in the model archive to ensemble (default: 'model.pt')
+            data_name_or_path (str, optional): point args.data to the archive
+                at the given path/URL. Can start with '.' or './' to reuse the
+                model archive path.
+        """
+        from fairseq import hub_utils
+        from .hub_interface import MultichannelGeneratorHubInterface
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            **kwargs,
+        )
+        logger.info(x["args"])
+        return MultichannelGeneratorHubInterface(x["args"], x["task"], x["models"])
+
+    @property
+    def supported_targets(self):
+        return {"next", "edge", "duration"}
+
+
+def base_lm_architecture(args):
+    """Fill in default values for architecture options missing from ``args`` (mutates ``args``)."""
+    if hasattr(args, "decoder_final_norm"):  # backward compatibility for older model checkpoints
+        args.no_decoder_final_norm = not args.decoder_final_norm
+
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_cross_layers = getattr(args, "decoder_cross_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0)
+    args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+    args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8)
+    args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0)
+
+    args.add_bos_token = getattr(args, "add_bos_token", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )  # output and input dims fall back to the embedding dim resolved above
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    # Model training is not stable without this, so it is forced rather than defaulted
+    args.decoder_normalize_before = True
+    args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
+    args.checkpoint_activations = getattr(args, "checkpoint_activations", False)
+    args.offload_activations = getattr(args, "offload_activations", False)
+    if args.offload_activations:  # offloading activations implies checkpointing them
+        args.checkpoint_activations = True
+
+
+@register_model_architecture("speech_dlm", "speech_dlm_big")
+def speech_dlm_big(args):
+    """Apply the larger "big" defaults to unset options, then the base defaults."""
+    for option, big_default in (("decoder_layers", 12), ("decoder_cross_layers", 12),
+                                ("decoder_embed_dim", 1024), ("decoder_ffn_embed_dim", 4096),
+                                ("decoder_attention_heads", 16)):
+        setattr(args, option, getattr(args, option, big_default))
+    base_lm_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
new file mode 100644
index 0000000000..f29215c2fe
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .s2s_conformer import *  # noqa
+from .s2s_conformer_translatotron2 import *  # noqa
+from .s2s_conformer_unity import *  # noqa
+from .s2s_transformer import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
new file mode 100644
index 0000000000..721efbf61a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch import nn
+
+from fairseq.models import FairseqEncoder
+
+
+class CTCDecoder(FairseqEncoder):
+    """Project input features to dictionary-sized outputs with one linear layer."""
+    def __init__(self, dictionary, in_dim):
+        super().__init__(dictionary)
+        self.proj = nn.Linear(in_dim, len(dictionary))
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        return {"encoder_out": self.proj(src_tokens)}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
new file mode 100644
index 0000000000..5955a08538
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+
+from fairseq.models.transformer import Linear
+
+
+class StackedEmbedding(nn.Embedding):
+    """Embedding that unpacks a stacked index into `num_stacked` units -> single embedding"""
+
+    def __init__(self, num_embeddings, embed_dim, padding_idx, num_stacked=1):
+        super().__init__(num_embeddings, embed_dim, padding_idx)
+        # follow transformer.Embedding
+        nn.init.normal_(self.weight, mean=0, std=embed_dim**-0.5)
+        nn.init.constant_(self.weight[padding_idx], 0)
+        # indices below `offset` are passed through unchanged by forward()
+        self.offset = (
+            4  # skip <bos>, <pad>, <eos>, <unk>, specific to fairseq dictionary
+        )
+        self.vocab_size = num_embeddings - self.offset  # entries after the skipped ones
+        self.num_stacked = num_stacked
+        # the input projection is only created when more than one unit is stacked
+        if self.num_stacked > 1:
+            self.project_in_dim = Linear(embed_dim * num_stacked, embed_dim, bias=False)
+
+    def forward(self, input):
+        if self.num_stacked == 1:
+            return super().forward(input)
+        # expand input indices: (input - offset) is read as `num_stacked` digits in
+        # base `vocab_size`, extracted lowest digit first; indices < offset are kept
+        mask = input >= self.offset  # True where the index gets decomposed
+        stacked_input = []
+        cum_input = input.new_zeros(input.shape)  # sum of the digit terms found so far
+        for i in range(1, self.num_stacked + 1):
+            div = pow(self.vocab_size, i)
+            next_input = torch.remainder(input - self.offset - cum_input, div)
+            cum_input += next_input
+            next_input = torch.floor_divide(next_input, div // self.vocab_size)
+            stacked_input.append((next_input + self.offset) * mask + input * ~mask)
+        # reverse so the highest digit comes first; the units are stacked on a new dim 2
+        stacked_input = torch.stack(stacked_input[::-1], dim=2)
+        embed = super().forward(stacked_input).view(input.size(0), input.size(1), -1)
+        embed = self.project_in_dim(embed)
+        return embed
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
new file mode 100644
index 0000000000..68f42c2b36
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, List, Optional
+
+from torch import Tensor
+
+from fairseq.models.transformer import Linear
+from fairseq.models.transformer.transformer_decoder_aug import AugTransformerDecoder
+
+
+class AugTransformerUnitDecoder(AugTransformerDecoder):
+    """AugTransformerDecoder variant that can decode several stacked units per step"""
+
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+        output_projection=None,
+    ):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn, output_projection
+        )
+        self.n_frames_per_step = args.n_frames_per_step
+
+        # widen each feature vector to one slice per frame; unused for one frame
+        if self.n_frames_per_step > 1:
+            self.out_proj_n_frames = Linear(
+                self.output_embed_dim,
+                self.output_embed_dim * self.n_frames_per_step,
+                bias=False,
+            )
+        else:
+            self.out_proj_n_frames = None
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+        src_lengths: Optional[Any] = None,
+        return_all_hiddens: bool = False,
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention, should be of size T x B x C
+            encoder_out_aug (optional): second encoder output, forwarded
+                unchanged to `extract_features`
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+
+        Returns:
+            tuple: the decoder's output of shape `(batch, tgt_len, vocab)` and
+            a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            encoder_out_aug=encoder_out_aug,
+            incremental_state=incremental_state,
+            full_context_alignment=full_context_alignment,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+        if features_only:
+            return x, extra
+
+        bsz, seq_len, d = x.size()
+        if self.out_proj_n_frames is not None:
+            x = self.out_proj_n_frames(x)
+        x = self.output_layer(x.view(bsz, seq_len, self.n_frames_per_step, d))
+        x = x.view(bsz, seq_len * self.n_frames_per_step, -1)
+
+        # teacher-forcing mode in training
+        if incremental_state is None and self.n_frames_per_step > 1:
+            # remove extra frames after <eos>
+            x = x[:, : -(self.n_frames_per_step - 1), :]
+
+        return x, extra
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rename a legacy checkpoint key in place.
+
+        `{name}.project_in_dim.weight` becomes
+        `{name}.embed_tokens.project_in_dim.weight` unless the latter exists.
+        """
+        if self.n_frames_per_step <= 1:
+            return
+        old_key = f"{name}.project_in_dim.weight"
+        new_key = f"{name}.embed_tokens.project_in_dim.weight"
+        if old_key in state_dict and new_key not in state_dict:
+            state_dict[new_key] = state_dict.pop(old_key)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
new file mode 100644
index 0000000000..fb1af433d8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+from fairseq.models import FairseqEncoder
+from fairseq.modules import LayerNorm, TransformerEncoderLayer
+
+
+class TransformerEncoderNoEmb(FairseqEncoder):
+    """Stack of Transformer encoder layers applied without token embeddings."""
+
+    def __init__(self, args):
+        super().__init__(None)
+
+        self.layers = nn.ModuleList(
+            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
+        )
+        # the final LayerNorm exists only when encoder_normalize_before is set
+        self.layer_norm = (
+            LayerNorm(args.encoder_embed_dim) if args.encoder_normalize_before else None
+        )
+
+    def forward(self, x, encoder_padding_mask, return_all_hiddens=False):
+        """Run `x` through every layer, then the optional final LayerNorm; the
+        padding mask is only returned when it contains at least one True entry."""
+        encoder_states = []
+        for layer in self.layers:
+            x = layer(x, encoder_padding_mask)
+            if return_all_hiddens:
+                encoder_states.append(x)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        keep_mask = encoder_padding_mask is not None and encoder_padding_mask.any()
+        padding_masks = [encoder_padding_mask] if keep_mask else []
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": padding_masks,  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder encoder output along the batch dimension.
+
+        Args:
+            encoder_out (dict): output of `forward`
+            new_order (Tensor): indices handed to `index_select`
+
+        Returns:
+            dict: same keys as the output of `forward`; `encoder_out` and
+            `encoder_states` are indexed on dim 1, `encoder_padding_mask` and
+            `encoder_embedding` on dim 0. The `encoder_states` list of the
+            input dict is overwritten in place and shared with the result.
+        """
+        # comprehensions over an empty list stay empty, so no length checks needed
+        new_encoder_out = [
+            t.index_select(1, new_order) for t in encoder_out["encoder_out"]
+        ]
+        new_encoder_padding_mask = [
+            m.index_select(0, new_order) for m in encoder_out["encoder_padding_mask"]
+        ]
+        new_encoder_embedding = [
+            e.index_select(0, new_order) for e in encoder_out["encoder_embedding"]
+        ]
+
+        # intermediate states are replaced in place
+        encoder_states = encoder_out["encoder_states"]
+        for idx, state in enumerate(encoder_states):
+            encoder_states[idx] = state.index_select(1, new_order)
+
+        return {
+            "encoder_out": new_encoder_out,  # T x B x C
+            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
+            "encoder_embedding": new_encoder_embedding,  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],  # B x T
+            "src_lengths": [],  # B x 1
+        }
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
new file mode 100644
index 0000000000..636396d536
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from pathlib import Path
+
+import torch
+
+from fairseq import checkpoint_utils
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.speech_to_speech.s2s_transformer import (
+    S2SpecTTransformerModel,
+    S2UTTransformerModel,
+    s2spect_architecture_base,
+    s2ut_architecture_base,
+)
+from fairseq.models.speech_to_text import S2TConformerEncoder
+from fairseq.models.transformer import Linear
+
+logger = logging.getLogger(__name__)
+
+
+def build_s2s_conformer_encoder(args):
+    """Build an S2SConformerEncoder and load pretrained weights if the path exists."""
+    encoder = S2SConformerEncoder(args)
+    pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
+    if pretraining_path is not None and Path(pretraining_path).exists():
+        encoder = checkpoint_utils.load_pretrained_component_from_model(
+            component=encoder, checkpoint=pretraining_path
+        )
+        logger.info(f"loaded pretrained encoder from: {pretraining_path}")
+    elif pretraining_path is not None:
+        logger.warning(
+            f"skipped pretraining because {pretraining_path} does not exist"
+        )
+    return encoder
+
+
+class S2SConformerEncoder(S2TConformerEncoder):
+    """S2T Conformer encoder that can additionally mix a target speaker
+    embedding into its output."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+        # maps the concatenated [encoder state, speaker vector] back to encoder width
+        self.spk_emb_proj = None
+        if args.target_speaker_embed:
+            in_dim = args.encoder_embed_dim + args.speaker_embed_dim
+            self.spk_emb_proj = Linear(in_dim, args.encoder_embed_dim)
+
+    def forward(
+        self, src_tokens, src_lengths, tgt_speaker=None, return_all_hiddens=False
+    ):
+        out = super().forward(src_tokens, src_lengths, return_all_hiddens)
+        if self.spk_emb_proj is None:
+            return out
+
+        enc = out["encoder_out"][0]
+        seq_len, bsz, _ = enc.size()
+        # repeat the per-sample speaker vector over every time step
+        spk = tgt_speaker.view(1, bsz, -1).expand(seq_len, bsz, -1)
+        out["encoder_out"][0] = self.spk_emb_proj(torch.cat([enc, spk], dim=2))
+        return out
+
+
+@register_model("s2ut_conformer")
+class S2UTConformerModel(S2UTTransformerModel):
+    """
+    Direct speech-to-speech translation model with Conformer encoder + Transformer discrete unit decoder
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the parent model's options plus the Conformer-specific ones."""
+        S2UTTransformerModel.add_args(parser)
+        # one (flag, type, metavar, help) row per Conformer option
+        conformer_options = (
+            (
+                "--depthwise-conv-kernel-size", int, "N",
+                "kernel size of depthwise convolution layers",
+            ),
+            (
+                "--attn-type", str, "STR",
+                "If not specified uses fairseq MHA. Other valid option is espnet for using conformer",
+            ),
+            (
+                "--pos-enc-type", str, "STR",
+                "Must be specified in addition to attn-type=espnet for rel_pos and rope",
+            ),
+        )
+        for flag, arg_type, metavar, help_text in conformer_options:
+            parser.add_argument(flag, type=arg_type, metavar=metavar, help=help_text)
+
+    @classmethod
+    def build_encoder(cls, args):
+        return build_s2s_conformer_encoder(args)
+
+
+@register_model("s2spect_conformer")
+class S2SpecTConformerModel(S2SpecTTransformerModel):
+    """
+    Direct speech-to-speech translation model with Conformer encoder + TTS Transformer decoder
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the parent model's options plus the Conformer-specific ones."""
+        S2SpecTTransformerModel.add_args(parser)
+        # help texts are named so each add_argument call fits on one line
+        attn_help = (
+            "If not specified uses fairseq MHA. Other valid option is espnet for using conformer"
+        )
+        pos_help = (
+            "Must be specified in addition to attn-type=espnet for rel_pos and rope"
+        )
+        # every option here carries its default in the parser itself
+        parser.add_argument("--depthwise-conv-kernel-size", type=int, default=31)
+        parser.add_argument("--attn-type", type=str, default=None, help=attn_help)
+        parser.add_argument("--pos-enc-type", type=str, default="abs", help=pos_help)
+
+    @classmethod
+    def build_encoder(cls, args):
+        """Build the encoder through build_s2s_conformer_encoder."""
+        return build_s2s_conformer_encoder(args)
+
+
+@register_model_architecture("s2ut_conformer", "s2ut_conformer")
+def s2ut_conformer_architecture_base(args):
+    """Fill unset Conformer encoder options, then apply s2ut_architecture_base."""
+    defaults = dict(
+        attn_type=None, pos_enc_type="abs",
+        input_feat_per_channel=80, input_channels=1,
+        max_source_positions=6000, encoder_embed_dim=256,
+        encoder_ffn_embed_dim=2048, encoder_attention_heads=4,
+        dropout=0.1, encoder_layers=16,
+        depthwise_conv_kernel_size=31,
+    )
+    for option, fallback in defaults.items():
+        setattr(args, option, getattr(args, option, fallback))
+    s2ut_architecture_base(args)
+
+
+@register_model_architecture("s2spect_conformer", "s2spect_conformer")
+def s2spect_conformer_architecture_base(args):
+    """Fill unset Conformer encoder options, then apply s2spect_architecture_base."""
+    defaults = dict(
+        attn_type=None, pos_enc_type="abs",
+        input_feat_per_channel=80, input_channels=1,
+        max_source_positions=6000, encoder_embed_dim=256,
+        encoder_ffn_embed_dim=2048, encoder_attention_heads=4,
+        dropout=0.1, encoder_layers=16,
+        depthwise_conv_kernel_size=31,
+    )
+    for option, fallback in defaults.items():
+        setattr(args, option, getattr(args, option, fallback))
+    s2spect_architecture_base(args)
+
+
+@register_model_architecture("s2spect_conformer", "s2spect_conformer_fisher")
+def s2spect_architecture_fisher(args):
+    """Fill unset "fisher" options, then apply s2spect_conformer_architecture_base."""
+    fisher_defaults = (
+        ("encoder_embed_dim", 256), ("encoder_ffn_embed_dim", 256 * 8),
+        ("encoder_attention_heads", 4), ("dropout", 0.1),
+        ("prenet_dim", 32),  # decoder
+    )
+    for option, fallback in fisher_defaults:
+        setattr(args, option, getattr(args, option, fallback))
+    s2spect_conformer_architecture_base(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
new file mode 100644
index 0000000000..8016daee8d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
@@ -0,0 +1,262 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+
+from fairseq.models import (
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
+from fairseq.models.speech_to_speech.modules.transformer_encoder import (
+    TransformerEncoderNoEmb,
+)
+from fairseq.models.speech_to_speech.s2s_conformer import S2SpecTConformerModel
+from fairseq.models.speech_to_speech.s2s_conformer_unity import (
+    multitask_text_transformer_decoder_arch,
+)
+from fairseq.models.speech_to_speech.s2s_transformer import (
+    base_multitask_text_transformer_decoder_arch,
+    s2spect_architecture_base,
+)
+from fairseq.models.text_to_speech import TTSTransformerDecoder
+from fairseq.models.transformer import TransformerDecoder, TransformerModelBase
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("s2spect2_conformer")
+class S2SpecT2ConformerModel(S2SpecTConformerModel):
+    """
+    Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + TTS Transformer decoder
+    """
+
+    @staticmethod
+    def add_args(parser):
+        S2SpecTConformerModel.add_args(parser)  # parent options first
+        parser.add_argument(
+            "--translation-decoder-layers",
+            type=int,
+            default=4,
+            metavar="N",
+            help="num decoder layers in the first-pass translation module",
+        )
+        parser.add_argument(
+            "--synthesizer",
+            default="transformer",
+            choices=["transformer"],  # build_decoder rejects any other value
+            help="",
+        )
+        parser.add_argument(
+            "--synthesizer-encoder-layers",
+            type=int,
+            default=0,  # build_model creates no synthesizer encoder when this is 0
+            metavar="N",
+            help="num encoder layers in the second-pass synthesizer module",
+        )
+
+    @classmethod
+    def build_multitask_decoder(
+        cls,
+        args,
+        tgt_dict,
+        in_dim,
+        is_mt_decoder,
+        decoder_layers,
+        decoder_embed_dim,
+        decoder_attention_heads,
+    ):
+        decoder_args = args.decoder_args  # NOTE: this object is modified in place below
+        decoder_args.encoder_embed_dim = in_dim
+        if args.decoder_type == "transformer":
+            if is_mt_decoder:
+                multitask_text_transformer_decoder_arch(
+                    decoder_args,
+                    decoder_layers,
+                    decoder_embed_dim,
+                    decoder_attention_heads,
+                )  # 4L
+            else:
+                base_multitask_text_transformer_decoder_arch(decoder_args)  # 2L
+            task_decoder = TransformerDecoder(
+                decoder_args,
+                tgt_dict,
+                embed_tokens=TransformerModelBase.build_embedding(
+                    decoder_args,
+                    tgt_dict,
+                    decoder_args.decoder_embed_dim,
+                ),
+            )
+        elif args.decoder_type == "ctc":
+            task_decoder = CTCDecoder(
+                dictionary=tgt_dict,
+                in_dim=in_dim,
+            )
+        else:
+            raise NotImplementedError(
+                "currently only support multitask decoder_type 'transformer', 'ctc'"
+            )
+
+        return task_decoder
+
+    @classmethod
+    def build_decoder(cls, args):
+        _args = copy.deepcopy(args)  # work on a copy so the caller's args stay unchanged
+        _args.encoder_embed_dim = args.decoder_embed_dim
+
+        if args.synthesizer == "transformer":
+            return TTSTransformerDecoder(_args, None, padding_idx=1)
+        else:
+            raise NotImplementedError(args.synthesizer)
+
+    @classmethod
+    def build_model(cls, args, task):
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args)
+        base_model = cls(encoder, decoder)
+
+        # set up multitask decoders
+        base_model.mt_task_name = None
+        base_model.multitask_decoders = {}
+        has_first_pass_decoder = False
+        for task_name, task_obj in task.multitask_tasks.items():
+            if task_obj.is_first_pass_decoder:
+                has_first_pass_decoder = True
+                base_model.mt_task_name = task_name  # a later first-pass task overwrites this
+            # input width follows the module this auxiliary decoder reads from
+            in_dim = (
+                args.encoder_embed_dim
+                if task_obj.args.input_from == "encoder"
+                else args.decoder_embed_dim
+            )
+            task_decoder = cls.build_multitask_decoder(
+                task_obj.args,
+                task_obj.target_dictionary,
+                in_dim,
+                task_obj.is_first_pass_decoder,
+                getattr(args, "translation_decoder_layers", 4),
+                getattr(args, "decoder_embed_dim", 256),
+                getattr(args, "decoder_attention_heads", 4),
+            )
+            # expose the decoder as `<task_name>_decoder` and keep a wrapped copy by name
+            setattr(base_model, f"{task_name}_decoder", task_decoder)
+            decoder_model_cls = (
+                FairseqEncoderModel
+                if task_obj.args.decoder_type == "ctc"
+                else FairseqLanguageModel
+            )
+            base_model.multitask_decoders[task_name] = decoder_model_cls(
+                getattr(base_model, f"{task_name}_decoder")
+            )
+        # forward() looks the MT decoder up by mt_task_name, so one must have been set
+        assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder"
+
+        # set up encoder on top of the auxiliary MT decoder
+        if getattr(args, "synthesizer_encoder_layers", 0) > 0:
+            base_model.synthesizer_encoder = cls.build_text_encoder(args)
+        else:
+            base_model.synthesizer_encoder = None
+
+        return base_model
+
+    @classmethod
+    def build_text_encoder(cls, args):
+        _args = copy.deepcopy(args)  # the encoder_* overrides below stay local to the copy
+        _args.encoder_layers = args.synthesizer_encoder_layers
+        _args.encoder_embed_dim = args.decoder_embed_dim
+        _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim
+        _args.encoder_attention_heads = args.decoder_attention_heads
+        _args.encoder_normalize_before = True
+        return TransformerEncoderNoEmb(_args)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        prev_output_tokens_mt,
+        tgt_speaker=None,
+        incremental_state=None,
+        target_lengths=None,
+        speaker=None,
+        return_all_hiddens=False,
+    ):
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            tgt_speaker=tgt_speaker,
+            return_all_hiddens=return_all_hiddens,
+        )
+
+        # 1. MT decoder
+        mt_decoder = getattr(self, f"{self.mt_task_name}_decoder")
+        mt_decoder_out = mt_decoder(
+            prev_output_tokens_mt,
+            encoder_out=encoder_out,
+        )
+        x = mt_decoder_out[1]["inner_states"][-1]  # last inner state of the MT decoder
+        if mt_decoder.layer_norm is not None:
+            x = mt_decoder.layer_norm(x)
+        # the mask stays None when the MT input contains no padding symbol
+        mt_decoder_padding_mask = None
+        if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
+            mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)
+
+        # 2. TTS encoder
+        if self.synthesizer_encoder is not None:
+            tts_encoder_out = self.synthesizer_encoder(
+                x,
+                mt_decoder_padding_mask,
+                return_all_hiddens=return_all_hiddens,
+            )
+        else:
+            tts_encoder_out = {
+                "encoder_out": [x],  # T x B x C
+                "encoder_padding_mask": [mt_decoder_padding_mask],  # B x T
+            }
+
+        # 3. TTS decoder
+        decoder_out = self.decoder(
+            prev_output_tokens,
+            encoder_out=tts_encoder_out,
+            incremental_state=incremental_state,
+            target_lengths=target_lengths,
+            speaker=speaker,
+        )
+        if return_all_hiddens:  # these come from the speech encoder, not the TTS encoder
+            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
+            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ]
+        decoder_out[-1]["mt_decoder_out"] = mt_decoder_out
+        return decoder_out
+
+
+@register_model_architecture(
+    model_name="s2spect2_conformer", arch_name="s2spect2_conformer"
+)
+def s2spect2_conformer_architecture_base(args):
+    """Fill unset Conformer encoder options, then apply s2spect_architecture_base."""
+    defaults = dict(
+        conv_version="convtransformer", attn_type=None,
+        pos_enc_type="abs", max_source_positions=6000,
+        encoder_embed_dim=256, encoder_ffn_embed_dim=2048,
+        encoder_attention_heads=4, dropout=0.1,
+        encoder_layers=16, depthwise_conv_kernel_size=31,
+    )
+    for option, fallback in defaults.items():
+        setattr(args, option, getattr(args, option, fallback))
+    s2spect_architecture_base(args)
+
+
+# legacy alias: registers the same defaults under the old architecture name
+@register_model_architecture(
+    model_name="s2spect2_conformer", arch_name="s2spect_conformer_translatotron2"
+)
+def s2spect2_conformer_architecture_base_legacy(args):
+    s2spect2_conformer_architecture_base(args)  # pure delegation, no extra defaults
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
new file mode 100644
index 0000000000..64388d6d16
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
@@ -0,0 +1,298 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
+from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding
+from fairseq.models.speech_to_speech.modules.transformer_decoder_aug import (
+    AugTransformerUnitDecoder,
+)
+from fairseq.models.speech_to_speech.modules.transformer_encoder import (
+    TransformerEncoderNoEmb,
+)
+from fairseq.models.speech_to_speech.s2s_conformer import S2UTConformerModel
+from fairseq.models.speech_to_speech.s2s_transformer import (
+    TransformerUnitDecoder,
+    base_multitask_text_transformer_decoder_arch,
+    s2ut_architecture_base,
+)
+from fairseq.models.transformer import TransformerDecoder, TransformerModelBase
+
+logger = logging.getLogger(__name__)
+
+
+def multitask_text_transformer_decoder_arch(
+    args, decoder_layers, decoder_embed_dim=256, decoder_attention_heads=4
+):
+    args.decoder_layers = decoder_layers  # these three always overwrite args
+    args.decoder_embed_dim = decoder_embed_dim
+    args.decoder_attention_heads = decoder_attention_heads
+    base_multitask_text_transformer_decoder_arch(args)  # fills what is still unset
+
+
+@register_model("unity_conformer")
+class UnityConformerModel(S2UTConformerModel):
+    """
+    Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + Transformer discrete unit decoder
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the S2UT Conformer options plus this model's two-pass options."""
+        S2UTConformerModel.add_args(parser)
+        parser.add_argument(
+            "--translation-decoder-layers",
+            type=int,
+            default=4,
+            metavar="N",
+            help="num decoder layers in the first-pass translation module",
+        )
+        parser.add_argument(
+            "--synthesizer",
+            default="transformer",
+            choices=["transformer"],
+            help="architecture of the second-pass synthesizer module",
+        )
+        parser.add_argument(
+            "--synthesizer-encoder-layers",
+            type=int,
+            default=0,
+            metavar="N",
+            help="num encoder layers in the second-pass synthesizer module",
+        )
+        parser.add_argument(
+            "--synthesizer-augmented-cross-attention",
+            action="store_true",
+            help="augmented cross-attention over speech encoder output",
+        )
+
+    @classmethod
+    def build_multitask_decoder(
+        cls,
+        args,
+        tgt_dict,
+        in_dim,
+        is_first_pass_decoder,
+        decoder_layers,
+        decoder_embed_dim,
+        decoder_attention_heads,
+    ):
+        """Build one auxiliary decoder: a Transformer text decoder or a CTC decoder."""
+        decoder_args = args.decoder_args
+        decoder_args.encoder_embed_dim = in_dim  # mutates args.decoder_args in place
+        if args.decoder_type == "transformer":
+            if is_first_pass_decoder:  # caller picks depth / width / heads
+                multitask_text_transformer_decoder_arch(
+                    decoder_args,
+                    decoder_layers,
+                    decoder_embed_dim,
+                    decoder_attention_heads,
+                )  # 4L by default
+            else:  # base defaults apply (2 layers unless decoder_args sets it)
+                base_multitask_text_transformer_decoder_arch(decoder_args)  # 2L
+            task_decoder = TransformerDecoder(
+                decoder_args,
+                tgt_dict,
+                embed_tokens=TransformerModelBase.build_embedding(
+                    decoder_args,
+                    tgt_dict,
+                    decoder_args.decoder_embed_dim,
+                ),
+            )
+        elif args.decoder_type == "ctc":
+            task_decoder = CTCDecoder(
+                dictionary=tgt_dict,
+                in_dim=in_dim,
+            )
+        else:
+            raise NotImplementedError(
+                "currently only support multitask decoder_type 'transformer', 'ctc'"
+            )
+        return task_decoder
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, aug_attn=False):
+        """Build the unit decoder; ``aug_attn`` picks the augmented variant."""
+        stacked_embedding = StackedEmbedding(
+            len(tgt_dict),
+            args.decoder_embed_dim,
+            tgt_dict.pad(),
+            num_stacked=args.n_frames_per_step,
+        )
+
+        # Work on a copy so the caller's args keep their own encoder_embed_dim.
+        decoder_args = copy.deepcopy(args)
+        decoder_args.encoder_embed_dim = args.decoder_embed_dim
+
+        if aug_attn:
+            unit_decoder_cls = AugTransformerUnitDecoder
+        else:
+            unit_decoder_cls = TransformerUnitDecoder
+        # Both classes are called with the same three positional arguments.
+        return unit_decoder_cls(decoder_args, tgt_dict, stacked_embedding)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build the encoder, unit decoder and every multitask decoder of ``task``.
+
+        Raises:
+            ValueError: if no multitask task is flagged as the first-pass decoder.
+        """
+        aug_attn = getattr(args, "synthesizer_augmented_cross_attention", False)
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args, task.target_dictionary, aug_attn=aug_attn)
+        base_model = cls(encoder, decoder)
+        base_model.t2u_augmented_cross_attn = aug_attn
+
+        # set up multitask decoders
+        base_model.mt_task_name = None
+        base_model.multitask_decoders = {}
+        has_first_pass_decoder = False
+        for task_name, task_obj in task.multitask_tasks.items():
+            if task_obj.is_first_pass_decoder:
+                has_first_pass_decoder = True
+                base_model.mt_task_name = task_name
+
+            in_dim = (
+                args.encoder_embed_dim
+                if task_obj.args.input_from == "encoder"
+                else args.decoder_embed_dim
+            )
+            task_decoder = cls.build_multitask_decoder(
+                task_obj.args,
+                task_obj.target_dictionary,
+                in_dim,
+                task_obj.is_first_pass_decoder,
+                getattr(args, "translation_decoder_layers", 4),
+                getattr(args, "decoder_embed_dim", 256),
+                getattr(args, "decoder_attention_heads", 4),
+            )
+
+            setattr(base_model, f"{task_name}_decoder", task_decoder)
+            decoder_model_cls = (
+                FairseqEncoderModel
+                if task_obj.args.decoder_type == "ctc"
+                else FairseqLanguageModel
+            )
+            base_model.multitask_decoders[task_name] = decoder_model_cls(task_decoder)
+
+        # forward() looks the first-pass decoder up by name, so it must exist; an
+        # explicit raise (unlike assert) still fires under ``python -O``.
+        if not has_first_pass_decoder:
+            raise ValueError("set at least one intermediate non-CTC decoder")
+
+        # set up encoder on top of the auxiliary MT decoder
+        if getattr(args, "synthesizer_encoder_layers", 0) > 0:
+            base_model.synthesizer_encoder = cls.build_text_encoder(args)
+        else:
+            base_model.synthesizer_encoder = None
+
+        return base_model
+
+    @classmethod
+    def build_text_encoder(cls, args):  # encoder used on top of the MT decoder
+        _args = copy.deepcopy(args)  # leave the caller's args untouched
+        _args.encoder_layers = args.synthesizer_encoder_layers
+        _args.encoder_embed_dim = args.decoder_embed_dim  # sized from decoder settings
+        _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim
+        _args.encoder_attention_heads = args.decoder_attention_heads
+        _args.encoder_normalize_before = True  # always on, whatever args says
+        return TransformerEncoderNoEmb(_args)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        prev_output_tokens_mt,
+        tgt_speaker=None,
+        return_all_hiddens=False,
+    ):
+        """Encode speech, decode text (first pass), then decode units (second pass)."""
+        mt_decoder = getattr(self, f"{self.mt_task_name}_decoder")
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            tgt_speaker=tgt_speaker,
+            return_all_hiddens=return_all_hiddens,
+        )
+
+        # 1. MT decoder
+        mt_decoder_out = mt_decoder(
+            prev_output_tokens_mt,
+            encoder_out=encoder_out,
+        )
+        x = mt_decoder_out[1]["inner_states"][-1]
+        if mt_decoder.layer_norm is not None:
+            x = mt_decoder.layer_norm(x)
+
+        mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)
+        if not mt_decoder_padding_mask.any():  # no padding in this batch
+            mt_decoder_padding_mask = None
+
+        # 2. T2U encoder
+        if self.synthesizer_encoder is not None:
+            t2u_encoder_out = self.synthesizer_encoder(
+                x,
+                mt_decoder_padding_mask,
+                return_all_hiddens=return_all_hiddens,
+            )
+        else:
+            t2u_encoder_out = {
+                "encoder_out": [x],  # T x B x C
+                "encoder_padding_mask": [mt_decoder_padding_mask],  # B x T
+            }
+
+        # 3. T2U decoder
+        if self.t2u_augmented_cross_attn:
+            decoder_out = self.decoder(
+                prev_output_tokens,
+                encoder_out=encoder_out,
+                encoder_out_aug=t2u_encoder_out,
+            )
+        else:
+            decoder_out = self.decoder(
+                prev_output_tokens,
+                encoder_out=t2u_encoder_out,
+            )
+        if return_all_hiddens:
+            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
+            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ]
+        decoder_out[-1]["mt_decoder_out"] = mt_decoder_out
+        return decoder_out
+
+
+@register_model_architecture(model_name="unity_conformer", arch_name="unity_conformer")
+def unity_conformer_architecture_base(args):  # only fills options still unset
+    args.conv_version = getattr(args, "conv_version", "convtransformer")
+    args.attn_type = getattr(args, "attn_type", None)
+    args.pos_enc_type = getattr(args, "pos_enc_type", "abs")
+    args.max_source_positions = getattr(args, "max_source_positions", 6000)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.encoder_layers = getattr(args, "encoder_layers", 16)
+    args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31)
+    s2ut_architecture_base(args)  # applied after the Conformer defaults above
+
+
+# Alias: registers the same defaults under the old architecture name.
+@register_model_architecture(
+    model_name="unity_conformer", arch_name="s2ut_conformer_translatotron2"
+)
+def unity_conformer_architecture_base_legacy(args):
+    unity_conformer_architecture_base(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
new file mode 100644
index 0000000000..07393d2598
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
@@ -0,0 +1,722 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch import Tensor
+
+from fairseq import checkpoint_utils, utils
+from fairseq.models import (
+    FairseqEncoderDecoderModel,
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
+from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding
+from fairseq.models.speech_to_text import S2TTransformerEncoder
+from fairseq.models.text_to_speech import TTSTransformerDecoder
+from fairseq.models.transformer import Linear, TransformerDecoder, TransformerModelBase
+
+logger = logging.getLogger(__name__)
+
+
+class S2STransformerEncoder(S2TTransformerEncoder):
+    """Based on S2T transformer encoder, with support
+    to incorporate target speaker embedding."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        # None unless target_speaker_embed is set; maps the concat back to embed dim.
+        self.spk_emb_proj = None
+        if args.target_speaker_embed:
+            self.spk_emb_proj = Linear(
+                args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
+            )
+
+    def forward(
+        self, src_tokens, src_lengths, tgt_speaker=None, return_all_hiddens=False
+    ):
+        """Encode the source; fuse ``tgt_speaker`` in when the projection exists."""
+        out = super().forward(src_tokens, src_lengths, return_all_hiddens)
+        if self.spk_emb_proj is not None:  # explicit check, not Module truthiness
+            x = out["encoder_out"][0]
+            seq_len, bsz, _ = x.size()
+            tgt_speaker_emb = tgt_speaker.view(1, bsz, -1).expand(seq_len, bsz, -1)
+            x = self.spk_emb_proj(torch.cat([x, tgt_speaker_emb], dim=2))
+            out["encoder_out"][0] = x
+
+        return out
+
+
+class TransformerUnitDecoder(TransformerDecoder):
+    """Based on Transformer decoder, with support to decoding stacked units"""
+
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+        output_projection=None,
+    ):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn, output_projection
+        )
+        self.n_frames_per_step = args.n_frames_per_step
+        # Widening projection (d -> d * n_frames_per_step), only built for > 1 frame.
+        self.out_proj_n_frames = (
+            Linear(
+                self.output_embed_dim,
+                self.output_embed_dim * self.n_frames_per_step,
+                bias=False,
+            )
+            if self.n_frames_per_step > 1
+            else None
+        )
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+        src_lengths: Optional[Any] = None,
+        return_all_hiddens: bool = False,
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention, should be of size T x B x C
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            src_lengths, return_all_hiddens: accepted but unused by this method.
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            full_context_alignment=full_context_alignment,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+        if not features_only:
+            bsz, seq_len, d = x.size()
+            # Explicit None check: the projection only exists for n_frames_per_step > 1.
+            if self.out_proj_n_frames is not None:
+                x = self.out_proj_n_frames(x)
+            x = self.output_layer(x.view(bsz, seq_len, self.n_frames_per_step, d))
+            x = x.view(bsz, seq_len * self.n_frames_per_step, -1)
+            if (
+                incremental_state is None and self.n_frames_per_step > 1
+            ):  # teacher-forcing mode in training
+                x = x[
+                    :, : -(self.n_frames_per_step - 1), :
+                ]  # remove extra frames after <eos>
+
+        return x, extra
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rename ``{name}.project_in_dim.weight`` to sit under ``embed_tokens``.
+
+        ``state_dict`` is edited in place; single-frame decoders are left as is.
+        """
+        if self.n_frames_per_step <= 1:
+            return
+        # Move the entry only if the old key exists and the new one does not.
+        old_key = f"{name}.project_in_dim.weight"
+        new_key = f"{name}.embed_tokens.project_in_dim.weight"
+        if old_key in state_dict and new_key not in state_dict:
+            state_dict[new_key] = state_dict.pop(old_key)
+
+
+class S2STransformerMultitaskModelBase(FairseqEncoderDecoderModel):
+    @classmethod
+    def build_encoder(cls, args):
+        """Create the encoder, loading pretrained weights when a valid path is set."""
+        encoder = S2STransformerEncoder(args)
+        ckpt_path = getattr(args, "load_pretrained_encoder_from", None)
+        if ckpt_path is None:
+            return encoder
+        if not Path(ckpt_path).exists():
+            logger.warning(f"skipped pretraining because {ckpt_path} does not exist")
+            return encoder
+        encoder = checkpoint_utils.load_pretrained_component_from_model(
+            component=encoder, checkpoint=ckpt_path
+        )
+        logger.info(f"loaded pretrained encoder from: {ckpt_path}")
+        return encoder
+
+    @classmethod
+    def build_multitask_decoder(cls, args, tgt_dict, in_dim):
+        """Build one auxiliary decoder: a Transformer text decoder or a CTC decoder."""
+        decoder_args = args.decoder_args
+        decoder_args.encoder_embed_dim = in_dim  # mutates args.decoder_args in place
+        if args.decoder_type == "transformer":
+            base_multitask_text_transformer_decoder_arch(decoder_args)
+            task_decoder = TransformerDecoder(
+                decoder_args,
+                tgt_dict,
+                embed_tokens=TransformerModelBase.build_embedding(
+                    decoder_args,
+                    tgt_dict,
+                    decoder_args.decoder_embed_dim,
+                ),
+            )
+        elif args.decoder_type == "ctc":
+            task_decoder = CTCDecoder(
+                dictionary=tgt_dict,
+                in_dim=in_dim,
+            )
+        else:
+            raise NotImplementedError(
+                "currently only support multitask decoder_type 'transformer', 'ctc'"
+            )
+        return task_decoder
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build encoder, main decoder and one wrapped decoder per multitask task."""
+        encoder = cls.build_encoder(args)
+        # Code targets get the dictionary; otherwise build_decoder is called without.
+        if task.args.target_is_code:
+            decoder = cls.build_decoder(args, task.target_dictionary)
+        else:
+            decoder = cls.build_decoder(args)
+        base_model = cls(encoder, decoder)
+
+        # set up multitask decoders
+        base_model.multitask_decoders = {}
+        for task_name, task_obj in task.multitask_tasks.items():
+            if task_obj.args.input_from == "encoder":
+                in_dim = args.encoder_embed_dim
+            else:
+                in_dim = args.decoder_embed_dim
+            task_decoder = cls.build_multitask_decoder(
+                task_obj.args, task_obj.target_dictionary, in_dim
+            )
+
+            setattr(base_model, f"{task_name}_decoder", task_decoder)
+            if task_obj.args.decoder_type == "ctc":
+                wrapper_cls = FairseqEncoderModel
+            else:
+                wrapper_cls = FairseqLanguageModel
+            # Wrap the decoder as it is stored on the model under its task name.
+            base_model.multitask_decoders[task_name] = wrapper_cls(
+                getattr(base_model, f"{task_name}_decoder")
+            )
+
+        return base_model
+
+    def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs):
+        return self.encoder(  # ``speaker`` is handed over as the encoder's tgt_speaker
+            src_tokens, src_lengths=src_lengths, tgt_speaker=speaker, **kwargs
+        )
+
+
+@register_model("s2ut_transformer")
+class S2UTTransformerModel(S2STransformerMultitaskModelBase):
+    """
+    Direct speech-to-speech translation model with Transformer encoder + Transformer discrete unit decoder
+    https://arxiv.org/abs/2107.05604
+    """
+
+    @staticmethod
+    def add_args(parser):  # registers this model's command-line options on parser
+        # input (convolutional subsampling front end)
+        parser.add_argument(
+            "--conv-kernel-sizes",
+            type=str,
+            metavar="STR",
+            help="kernel sizes of Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-out-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv2d (convtransformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-version",
+            type=str,
+            default="s2t_transformer",
+            choices=["s2t_transformer", "convtransformer"],
+            help="version of frontend convolutional layers",
+        )
+        # Transformer (plus encoder initialization / freezing options)
+        parser.add_argument(
+            "--activation-fn",
+            type=str,
+            default="relu",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--activation-dropout",
+            "--relu-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN.",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--layernorm-embedding",
+            action="store_true",
+            help="add layernorm to embedding",
+        )
+        parser.add_argument(
+            "--no-scale-embedding",
+            action="store_true",
+            help="if True, dont scale embeddings",
+        )
+        parser.add_argument(
+            "--load-pretrained-encoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take encoder weights from (for initialization)",
+        )
+        parser.add_argument(
+            "--encoder-freezing-updates",
+            type=int,
+            metavar="N",
+            help="freeze encoder for first N updates",
+        )
+        # speaker embedding
+        parser.add_argument(
+            "--speaker-embed-dim",
+            type=int,
+            metavar="N",
+            help="speaker embedding dimension",
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict):
+        """Build the unit decoder for ``tgt_dict``.
+
+        The embedding is a StackedEmbedding with ``args.n_frames_per_step`` stacks.
+        """
+        vocab_size = len(tgt_dict)
+        pad_index = tgt_dict.pad()
+        stacked_embedding = StackedEmbedding(
+            vocab_size,
+            args.decoder_embed_dim,
+            pad_index,
+            num_stacked=args.n_frames_per_step,
+        )
+
+        return TransformerUnitDecoder(args, tgt_dict, stacked_embedding)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        tgt_speaker=None,
+        return_all_hiddens=False,
+    ):
+        """Encode the source, decode units, optionally expose encoder states."""
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            tgt_speaker=tgt_speaker,
+            return_all_hiddens=return_all_hiddens,
+        )
+        decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out)
+
+        if not return_all_hiddens:
+            return decoder_out
+        # The encoder states are attached to the last element of the decoder output.
+        extra = decoder_out[-1]
+        extra["encoder_states"] = encoder_out["encoder_states"]
+        extra["encoder_padding_mask"] = encoder_out["encoder_padding_mask"]
+        return decoder_out
+
+
+@register_model("s2spect_transformer")
+class S2SpecTTransformerModel(S2STransformerMultitaskModelBase):
+    """
+    Speech-to-spectrogram model with S2T Transformer encoder + TTS Transformer decoder
+    """
+
+    @staticmethod
+    def add_args(parser):  # registers this model's command-line options on parser
+        # input (convolutional subsampling front end)
+        parser.add_argument(
+            "--conv-kernel-sizes",
+            type=str,
+            metavar="STR",
+            help="kernel sizes of Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-version",
+            type=str,
+            default="s2t_transformer",
+            choices=["s2t_transformer", "convtransformer"],
+            help="version of frontend convolutional layers",
+        )
+        # Transformer encoder (plus encoder initialization / freezing options)
+        parser.add_argument(
+            "--activation-fn",
+            type=str,
+            default="relu",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--activation-dropout",
+            "--relu-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN.",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--no-scale-embedding",
+            action="store_true",
+            help="if True, dont scale embeddings",
+        )
+        parser.add_argument(
+            "--load-pretrained-encoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take encoder weights from (for initialization)",
+        )
+        parser.add_argument(
+            "--encoder-freezing-updates",
+            type=int,
+            metavar="N",
+            help="freeze encoder for first N updates",
+        )
+        # speaker embedding
+        parser.add_argument(
+            "--speaker-embed-dim",
+            type=int,
+            metavar="N",
+            help="speaker embedding dimension",
+        )
+        # decoder (the options below are registered without help text)
+        parser.add_argument("--output-frame-dim", type=int)
+        # decoder prenet
+        parser.add_argument("--prenet-dropout", type=float)
+        parser.add_argument("--prenet-layers", type=int)
+        parser.add_argument("--prenet-dim", type=int)
+        # decoder postnet
+        parser.add_argument("--postnet-dropout", type=float)
+        parser.add_argument("--postnet-layers", type=int)
+        parser.add_argument("--postnet-conv-dim", type=int)
+        parser.add_argument("--postnet-conv-kernel-size", type=int)
+        # decoder transformer layers
+        parser.add_argument("--decoder-transformer-layers", type=int)
+        parser.add_argument("--decoder-embed-dim", type=int)
+        parser.add_argument("--decoder-ffn-embed-dim", type=int)
+        parser.add_argument("--decoder-normalize-before", action="store_true")
+        parser.add_argument("--decoder-attention-heads", type=int)
+
+    @classmethod
+    def build_decoder(cls, args):  # takes no dictionary, unlike the S2UT variant
+        return TTSTransformerDecoder(args, None, padding_idx=1)  # 2nd arg left as None
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        tgt_speaker=None,
+        incremental_state=None,
+        target_lengths=None,
+        speaker=None,
+        return_all_hiddens=False,
+    ):
+        """Run encoder then decoder; ``tgt_speaker`` feeds the encoder, ``speaker`` the decoder."""
+        encoder_out = self.encoder(
+            src_tokens,
+            src_lengths=src_lengths,
+            tgt_speaker=tgt_speaker,
+            return_all_hiddens=return_all_hiddens,
+        )
+        decoder_out = self.decoder(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            target_lengths=target_lengths,
+            speaker=speaker,
+        )
+        if return_all_hiddens:
+            extra = decoder_out[-1]
+            extra["encoder_states"] = encoder_out["encoder_states"]
+            extra["encoder_padding_mask"] = encoder_out["encoder_padding_mask"]
+        return decoder_out
+
+
+def base_multitask_text_transformer_decoder_arch(args):
+    """Fill every text-decoder option still missing on ``args`` with its default."""
+    args.dropout = getattr(args, "dropout", 0.3)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", True
+    )
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256)
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    args.max_target_positions = getattr(args, "max_target_positions", 1024)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+
+    args.decoder_layers = getattr(args, "decoder_layers", 2)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+
+    # decoder layer (both dropout defaults reuse args.dropout resolved above)
+    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
+
+    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
+
+
+def base_s2st_transformer_encoder_architecture(args):
+    """Populate ``args`` with encoder-side defaults shared by the S2ST models.
+
+    Only attributes missing from ``args`` are set. ``attention_dropout`` and
+    ``activation_dropout`` fall back to whatever ``dropout`` ends up being.
+    """
+    fixed_defaults = (
+        ("encoder_freezing_updates", 0),
+        # Convolutional subsampler
+        ("input_channels", 1),
+        ("conv_kernel_sizes", "5,5"),  # for Conv1d
+        ("conv_channels", 1024),  # for Conv1d
+        ("conv_out_channels", 256),  # for Conv2d
+        ("conv_version", "s2t_transformer"),
+        # Transformer
+        ("encoder_embed_dim", 512),
+        ("encoder_ffn_embed_dim", 2048),
+        ("encoder_layers", 12),
+        ("encoder_attention_heads", 8),
+        ("encoder_normalize_before", True),
+        ("no_scale_embedding", False),
+        ("dropout", 0.1),
+    )
+    for name, value in fixed_defaults:
+        setattr(args, name, getattr(args, name, value))
+
+    # These two inherit the (possibly user-supplied) global dropout.
+    for name in ("attention_dropout", "activation_dropout"):
+        setattr(args, name, getattr(args, name, args.dropout))
+
+    for name, value in (("activation_fn", "relu"), ("speaker_embed_dim", 256)):
+        setattr(args, name, getattr(args, name, value))
+
+
+@register_model_architecture(
+    model_name="s2ut_transformer", arch_name="s2ut_transformer"
+)
+def s2ut_architecture_base(args):
+    """Fill in defaults for the base ``s2ut_transformer`` architecture.
+
+    Encoder defaults come from ``base_s2st_transformer_encoder_architecture``;
+    the decoder defaults below are applied only where ``args`` has no value.
+    """
+    base_s2st_transformer_encoder_architecture(args)
+
+    def fill(name, fallback):
+        # Keep an existing value, otherwise install ``fallback``.
+        setattr(args, name, getattr(args, name, fallback))
+
+    # decoder: widths mirror the encoder unless overridden
+    fill("decoder_embed_dim", args.encoder_embed_dim)
+    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
+    fill("decoder_layers", 6)
+    fill("decoder_attention_heads", 8)
+    fill("decoder_normalize_before", True)
+    fill("decoder_learned_pos", False)
+    fill("adaptive_softmax_cutoff", None)
+    fill("adaptive_softmax_dropout", 0)
+    fill("share_decoder_input_output_embed", False)
+    fill("no_token_positional_embeddings", False)
+    fill("adaptive_input", False)
+    fill("decoder_layerdrop", 0.0)
+    fill("decoder_output_dim", args.decoder_embed_dim)
+    fill("decoder_input_dim", args.decoder_embed_dim)
+    fill("quant_noise_pq", 0)
+
+
+@register_model_architecture("s2ut_transformer", "s2ut_transformer_fisher")
+def s2ut_architecture_fisher(args):
+    """Fisher variant of ``s2ut_transformer``: smaller encoder defaults.
+
+    The overrides are installed first so that the base architecture, which
+    only fills in missing attributes, leaves them in place.
+    """
+    fisher_defaults = (
+        ("encoder_embed_dim", 256),
+        ("encoder_attention_heads", 4),
+        ("dropout", 0.1),
+    )
+    for name, value in fisher_defaults:
+        setattr(args, name, getattr(args, name, value))
+
+    s2ut_architecture_base(args)
+
+
+@register_model_architecture(
+    model_name="s2spect_transformer", arch_name="s2spect_transformer"
+)
+def s2spect_architecture_base(args):
+    """Fill in defaults for the base ``s2spect_transformer`` architecture.
+
+    Encoder defaults come from ``base_s2st_transformer_encoder_architecture``;
+    the decoder defaults below are applied only where ``args`` has no value.
+    """
+    base_s2st_transformer_encoder_architecture(args)
+
+    fixed_defaults = (
+        # decoder
+        ("output_frame_dim", 80),
+        # decoder prenet
+        ("prenet_dropout", 0.5),
+        ("prenet_layers", 2),
+        ("prenet_dim", 256),
+        # decoder postnet
+        ("postnet_dropout", 0.5),
+        ("postnet_layers", 5),
+        ("postnet_conv_dim", 512),
+        ("postnet_conv_kernel_size", 5),
+        # decoder transformer layers
+        ("decoder_transformer_layers", 6),
+        ("decoder_embed_dim", 512),
+    )
+    for name, value in fixed_defaults:
+        setattr(args, name, getattr(args, name, value))
+
+    # The feed-forward width defaults to four times the embedding width.
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", 4 * args.decoder_embed_dim
+    )
+    for name, value in (
+        ("decoder_normalize_before", False),
+        ("decoder_attention_heads", 4),
+    ):
+        setattr(args, name, getattr(args, name, value))
+
+
+@register_model_architecture("s2spect_transformer", "s2spect_transformer_fisher")
+def s2spect_architecture_fisher(args):
+    """Fisher variant of ``s2spect_transformer``: smaller encoder and prenet.
+
+    The overrides are installed first so that the base architecture, which
+    only fills in missing attributes, leaves them in place.
+    """
+    fisher_defaults = (
+        ("encoder_embed_dim", 256),
+        ("encoder_ffn_embed_dim", 256 * 8),
+        ("encoder_attention_heads", 4),
+        ("dropout", 0.1),
+        # decoder
+        ("prenet_dim", 32),
+    )
+    for name, value in fisher_defaults:
+        setattr(args, name, getattr(args, name, value))
+
+    s2spect_architecture_base(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
new file mode 100644
index 0000000000..62ef663efb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .berard import *  # noqa
+from .convtransformer import *  # noqa
+from .multi_modality_model import *  # noqa
+from .s2t_conformer import *  # noqa
+from .s2t_transformer import *  # noqa
+from .s2t_wav_transformer import *  # noqa
+from .xm_transformer import *  # noqa
+from .xm_transformer_unity import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
new file mode 100644
index 0000000000..107ac983c6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
@@ -0,0 +1,607 @@
+#!/usr/bin/env python3
+
+from ast import literal_eval
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+
+
+@register_model("s2t_berard")
+class BerardModel(FairseqEncoderDecoderModel):
+    """Speech translation model in the spirit of https://arxiv.org/abs/1802.04200
+
+    Paper title: End-to-End Automatic Speech Translation of Audiobooks
+    A TensorFlow implementation is available at
+    https://github.com/eske/seq2seq
+    The relevant files there are the config
+    (https://github.com/eske/seq2seq/blob/master/config/LibriSpeech/AST.yaml)
+    and the model code
+    (https://github.com/eske/seq2seq/blob/master/translate/models.py).
+    The encoder and decoder try to stay close to that implementation.
+    Attention is an MLP as in Bahdanau et al.
+    (https://arxiv.org/abs/1409.0473).
+    """
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the model-specific command line options on ``parser``."""
+        # (flag, type, metavar, help) in the order they are registered.
+        option_table = (
+            (
+                "--input-layers",
+                str,
+                "EXPR",
+                "List of linear layer dimensions. These "
+                "layers are applied to the input features and "
+                "are followed by tanh and possibly dropout.",
+            ),
+            (
+                "--dropout",
+                float,
+                "D",
+                "Dropout probability to use in the encoder/decoder. "
+                "Note that this parameters control dropout in various places, "
+                "there is no fine-grained control for dropout for embeddings "
+                "vs LSTM layers for example.",
+            ),
+            (
+                "--in-channels",
+                int,
+                "N",
+                "Number of encoder input channels. " "Typically value is 1.",
+            ),
+            (
+                "--conv-layers",
+                str,
+                "EXPR",
+                "List of conv layers " "(format: (channels, kernel, stride)).",
+            ),
+            (
+                "--num-blstm-layers",
+                int,
+                "N",
+                "Number of encoder bi-LSTM layers.",
+            ),
+            ("--lstm-size", int, "N", "LSTM hidden size."),
+            (
+                "--decoder-embed-dim",
+                int,
+                "N",
+                "Embedding dimension of the decoder target tokens.",
+            ),
+            (
+                "--decoder-hidden-dim",
+                int,
+                "N",
+                "Decoder LSTM hidden dimension.",
+            ),
+            (
+                "--decoder-num-layers",
+                int,
+                "N",
+                "Number of decoder LSTM layers.",
+            ),
+            (
+                "--attention-dim",
+                int,
+                "N",
+                "Hidden layer dimension in MLP attention.",
+            ),
+            (
+                "--output-layer-dim",
+                int,
+                "N",
+                "Hidden layer dim for linear layer prior to output projection.",
+            ),
+            (
+                "--load-pretrained-encoder-from",
+                str,
+                "STR",
+                "model to take encoder weights from (for initialization)",
+            ),
+            (
+                "--load-pretrained-decoder-from",
+                str,
+                "STR",
+                "model to take decoder weights from (for initialization)",
+            ),
+        )
+        for flag, value_type, metavar, help_text in option_table:
+            parser.add_argument(
+                flag, type=value_type, metavar=metavar, help=help_text
+            )
+
+    @classmethod
+    def build_encoder(cls, args, task):
+        """Build a ``BerardEncoder`` from ``args``, optionally loading weights.
+
+        ``args.input_layers`` and ``args.conv_layers`` are Python-literal
+        strings and are parsed with ``ast.literal_eval``. The channel count
+        is read from ``args.input_channels``.
+        """
+        encoder = BerardEncoder(
+            input_layers=literal_eval(args.input_layers),
+            conv_layers=literal_eval(args.conv_layers),
+            in_channels=args.input_channels,
+            input_feat_per_channel=args.input_feat_per_channel,
+            num_blstm_layers=args.num_blstm_layers,
+            lstm_size=args.lstm_size,
+            dropout=args.dropout,
+        )
+        if getattr(args, "load_pretrained_encoder_from", None) is None:
+            return encoder
+        return checkpoint_utils.load_pretrained_component_from_model(
+            component=encoder, checkpoint=args.load_pretrained_encoder_from
+        )
+
+    @classmethod
+    def build_decoder(cls, args, task):
+        """Build an ``LSTMDecoder`` from ``args``, optionally loading weights."""
+        decoder = LSTMDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.decoder_embed_dim,
+            num_layers=args.decoder_num_layers,
+            hidden_size=args.decoder_hidden_dim,
+            dropout=args.dropout,
+            encoder_output_dim=2 * args.lstm_size,  # bidirectional
+            attention_dim=args.attention_dim,
+            output_layer_dim=args.output_layer_dim,
+        )
+        if getattr(args, "load_pretrained_decoder_from", None) is None:
+            return decoder
+        return checkpoint_utils.load_pretrained_component_from_model(
+            component=decoder, checkpoint=args.load_pretrained_decoder_from
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        return cls(cls.build_encoder(args, task), cls.build_decoder(args, task))
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Return the parent's normalized probabilities, tagged as batch-first."""
+        # The decoder output is laid out as (B, T, D), so the result is too;
+        # record that on the tensor itself.
+        probs = super().get_normalized_probs(net_output, log_probs, sample)
+        probs.batch_first = True
+        return probs
+
+
+class BerardEncoder(FairseqEncoder):
+    """Speech encoder: per-frame linear layers, a Conv2d stack, then a bi-LSTM."""
+
+    def __init__(
+        self,
+        input_layers: List[int],
+        conv_layers: List[Tuple[int]],
+        in_channels: int,
+        input_feat_per_channel: int,
+        num_blstm_layers: int,
+        lstm_size: int,
+        dropout: float,
+    ):
+        """
+        Args:
+            input_layers: list of linear layer dimensions. These layers are
+                applied to the input features and are followed by tanh and
+                possibly dropout. May be empty.
+            conv_layers: list of conv2d layer configurations. A configuration is
+                a tuple (out_channels, conv_kernel_size, stride). May be empty.
+            in_channels: number of input channels.
+            input_feat_per_channel: number of input features per channel. These
+                are speech features, typically 40 or 80.
+            num_blstm_layers: number of bidirectional LSTM layers.
+            lstm_size: size of the LSTM hidden (and cell) size.
+            dropout: dropout probability. Dropout can be applied after the
+                linear layers and LSTM layers but not to the convolutional
+                layers.
+        """
+        super().__init__(None)
+
+        self.input_layers = nn.ModuleList()
+        in_features = input_feat_per_channel
+        for out_features in input_layers:
+            if dropout > 0:
+                self.input_layers.append(
+                    nn.Sequential(
+                        nn.Linear(in_features, out_features), nn.Dropout(p=dropout)
+                    )
+                )
+            else:
+                self.input_layers.append(nn.Linear(in_features, out_features))
+            in_features = out_features
+
+        self.in_channels = in_channels
+        self.input_dim = input_feat_per_channel
+        self.conv_kernel_sizes_and_strides = []
+        self.conv_layers = nn.ModuleList()
+        # Feature width entering the conv stack: the last linear layer's
+        # output, or the raw feature width when there are no linear layers.
+        lstm_input_dim = in_features
+        conv_in_channels = in_channels
+        for out_channels, conv_kernel_size, conv_stride in conv_layers:
+            conv_padding = conv_kernel_size // 2
+            self.conv_layers.append(
+                nn.Conv2d(
+                    conv_in_channels,
+                    out_channels,
+                    conv_kernel_size,
+                    stride=conv_stride,
+                    padding=conv_padding,
+                )
+            )
+            self.conv_kernel_sizes_and_strides.append((conv_kernel_size, conv_stride))
+            conv_in_channels = out_channels
+            # Exact Conv2d output size along the feature axis (dilation 1).
+            # This is the same formula forward() applies to the lengths; a
+            # plain ``// stride`` is only right when the stride divides the
+            # width and the kernel is odd.
+            lstm_input_dim = (
+                lstm_input_dim + 2 * conv_padding - conv_kernel_size
+            ) // conv_stride + 1
+
+        # forward() flattens (channels, features) into one axis for the LSTM.
+        # Without conv layers the channel count is still ``in_channels``.
+        lstm_input_dim *= conv_in_channels
+        self.lstm_size = lstm_size
+        self.num_blstm_layers = num_blstm_layers
+        self.lstm = nn.LSTM(
+            input_size=lstm_input_dim,
+            hidden_size=lstm_size,
+            num_layers=num_blstm_layers,
+            dropout=dropout,
+            bidirectional=True,
+        )
+        self.output_dim = 2 * lstm_size  # bidirectional
+        if dropout > 0:
+            self.dropout = nn.Dropout(p=dropout)
+        else:
+            self.dropout = None
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        """
+        Args
+            src_tokens: padded tensor (B, T, C * feat)
+            src_lengths: tensor of original lengths of input utterances (B,)
+
+        Returns:
+            dict with ``encoder_out`` (T, B, C) and ``encoder_padding_mask``
+            (T, B), where T is the sequence length after the conv stack.
+        """
+        bsz, max_seq_len, _ = src_tokens.size()
+        # (B, C, T, feat)
+        x = (
+            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+        for input_layer in self.input_layers:
+            x = input_layer(x)
+            x = torch.tanh(x)
+
+        for conv_layer in self.conv_layers:
+            x = conv_layer(x)
+
+        bsz, _, output_seq_len, _ = x.size()
+
+        # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) ->
+        # (T, B, C * feat)
+        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
+
+        # Push the utterance lengths through the same size arithmetic the
+        # convolutions applied to the time axis.
+        input_lengths = src_lengths.clone()
+        for k, s in self.conv_kernel_sizes_and_strides:
+            p = k // 2
+            input_lengths = (input_lengths.float() + 2 * p - k) / s + 1
+            input_lengths = input_lengths.floor().long()
+
+        # pack_padded_sequence requires the lengths tensor to live on the CPU.
+        packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths.cpu())
+
+        h0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_()
+        c0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_()
+        packed_outs, _ = self.lstm(packed_x, (h0, c0))
+
+        # unpack outputs and apply dropout
+        x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_outs)
+        if self.dropout is not None:
+            x = self.dropout(x)
+
+        encoder_padding_mask = (
+            lengths_to_padding_mask(output_lengths).to(src_tokens.device).t()
+        )
+
+        return {
+            "encoder_out": x,  # (T, B, C)
+            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder both encoder outputs along the batch axis (dim 1), in place."""
+        encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
+            1, new_order
+        )
+        encoder_out["encoder_padding_mask"] = encoder_out[
+            "encoder_padding_mask"
+        ].index_select(1, new_order)
+        return encoder_out
+
+
+class MLPAttention(nn.Module):
+    """Additive attention as introduced by Bahdanau et al. (2014)
+
+    https://arxiv.org/abs/1409.0473, implemented with a Multi-Layer Perceptron.
+    For encoder position i and decoder position j the score is
+    alpha_ij = V_a * tanh(W_ae * enc_i + W_ad * dec_j + b_a)
+    """
+
+    def __init__(self, decoder_hidden_state_dim, context_dim, attention_dim):
+        super().__init__()
+
+        self.context_dim = context_dim
+        self.attention_dim = attention_dim
+        # W_ae (weight) and b_a (bias) live in the encoder projection.
+        self.encoder_proj = nn.Linear(context_dim, self.attention_dim, bias=True)
+        # W_ad: bias-free, since b_a is already part of encoder_proj.
+        self.decoder_proj = nn.Linear(
+            decoder_hidden_state_dim, self.attention_dim, bias=False
+        )
+        # V_a: maps each attention vector to a scalar score.
+        self.to_scores = nn.Linear(self.attention_dim, 1, bias=False)
+
+    def forward(self, decoder_state, source_hids, encoder_padding_mask):
+        """Attend over ``source_hids`` given one decoder state per batch item.
+
+        Expected shapes:
+        decoder_state: bsz x decoder_hidden_state_dim
+        source_hids: src_len x bsz x context_dim
+        encoder_padding_mask: src_len x bsz (or None)
+
+        Returns the context (bsz x context_dim) and the attention weights
+        (src_len x bsz).
+        """
+        src_len, bsz, _ = source_hids.size()
+        att_dim = self.attention_dim
+
+        # Project the flattened encoder states, then restore
+        # src_len x bsz x attention_dim.
+        projected_src = self.encoder_proj(
+            source_hids.view(-1, self.context_dim)
+        ).view(src_len, bsz, att_dim)
+        # 1 x bsz x attention_dim, broadcast across source positions below.
+        projected_dec = self.decoder_proj(decoder_state).unsqueeze(0)
+
+        energies = torch.tanh((projected_dec + projected_src).view(-1, att_dim))
+        # One scalar per (source position, batch item).
+        attn_scores = self.to_scores(energies).view(src_len, bsz)
+
+        if encoder_padding_mask is not None:
+            # FP16 support: mask in float32, then cast back.
+            masked = attn_scores.float().masked_fill_(
+                encoder_padding_mask, float("-inf")
+            )
+            attn_scores = masked.type_as(attn_scores)
+
+        # Normalize over source positions (src_len x bsz).
+        weights = F.softmax(attn_scores, dim=0)
+        # Weighted sum of the encoder states (bsz x context_dim).
+        context = (source_hids * weights.unsqueeze(2)).sum(dim=0)
+
+        return context, weights
+
+
+class LSTMDecoder(FairseqIncrementalDecoder):
+    """Stacked LSTMCell decoder with MLP attention over the encoder outputs."""
+
+    def __init__(
+        self,
+        dictionary,
+        embed_dim,
+        num_layers,
+        hidden_size,
+        dropout,
+        encoder_output_dim,
+        attention_dim,
+        output_layer_dim,
+    ):
+        """
+        Args:
+            dictionary: target text dictionary.
+            embed_dim: embedding dimension for target tokens.
+            num_layers: number of LSTM layers.
+            hidden_size: hidden size for LSTM layers. forward() seeds the
+                hidden states with the mean of the encoder outputs, so this
+                has to match ``encoder_output_dim``.
+            dropout: dropout probability. Dropout can be applied to the
+                embeddings, the LSTM layers, and the context vector.
+            encoder_output_dim: encoder output dimension (hidden size of
+                encoder LSTM).
+            attention_dim: attention dimension for MLP attention.
+            output_layer_dim: size of the linear layer prior to output
+                projection.
+        """
+        super().__init__(dictionary)
+        self.num_layers = num_layers
+        self.hidden_size = hidden_size
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.pad()
+        self.embed_tokens = nn.Embedding(num_embeddings, embed_dim, padding_idx)
+        if dropout > 0:
+            self.dropout = nn.Dropout(p=dropout)
+        else:
+            self.dropout = None
+
+        self.layers = nn.ModuleList()
+        for layer_id in range(num_layers):
+            # The bottom layer reads token embeddings; every layer above it
+            # reads the attention context, whose width is encoder_output_dim.
+            input_size = embed_dim if layer_id == 0 else encoder_output_dim
+            self.layers.append(
+                nn.LSTMCell(input_size=input_size, hidden_size=hidden_size)
+            )
+
+        self.context_dim = encoder_output_dim
+        self.attention = MLPAttention(
+            decoder_hidden_state_dim=hidden_size,
+            context_dim=encoder_output_dim,
+            attention_dim=attention_dim,
+        )
+
+        self.deep_output_layer = nn.Linear(
+            hidden_size + encoder_output_dim + embed_dim, output_layer_dim
+        )
+        self.output_projection = nn.Linear(output_layer_dim, num_embeddings)
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs
+    ):
+        """Decode ``prev_output_tokens`` against the encoder outputs.
+
+        Args:
+            prev_output_tokens: (B, T) previous target tokens. Only the last
+                column is used when ``incremental_state`` is given.
+            encoder_out: dict with ``encoder_out`` (src_len, B, C) and
+                ``encoder_padding_mask`` (src_len, B).
+            incremental_state: cache for step-by-step generation, or None.
+
+        Returns:
+            ``(logits, None)`` where ``logits`` is (B, T, vocab). Attention
+            scores are not returned.
+        """
+        encoder_padding_mask = encoder_out["encoder_padding_mask"]
+        encoder_outs = encoder_out["encoder_out"]
+
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+        bsz, seqlen = prev_output_tokens.size()
+
+        # embed tokens
+        embeddings = self.embed_tokens(prev_output_tokens)
+        x = embeddings
+        if self.dropout is not None:
+            x = self.dropout(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # initialize previous states (or get from cache during incremental
+        # generation)
+        cached_state = utils.get_incremental_state(
+            self, incremental_state, "cached_state"
+        )
+        if cached_state is not None:
+            prev_hiddens, prev_cells = cached_state
+        else:
+            # Hidden states start from the mean of the encoder outputs over
+            # all source positions (the padding mask is not consulted);
+            # cell states start at zero.
+            prev_hiddens = [encoder_outs.mean(dim=0)] * self.num_layers
+            prev_cells = [x.new_zeros(bsz, self.hidden_size)] * self.num_layers
+
+        attention_outs = []
+        outs = []
+        for j in range(seqlen):
+            layer_input = x[j, :, :]
+            attention_out = None
+            for i, layer in enumerate(self.layers):
+                # the previous state is one layer below except for the bottom
+                # layer where the previous state is the state emitted by the
+                # top layer
+                below = (i - 1) % self.num_layers
+                hidden, cell = layer(
+                    layer_input, (prev_hiddens[below], prev_cells[below])
+                )
+                if self.dropout is not None:
+                    hidden = self.dropout(hidden)
+                prev_hiddens[i] = hidden
+                prev_cells[i] = cell
+                if attention_out is None:
+                    # Attention is computed once per time step, from the
+                    # bottom layer's hidden state; the scores are discarded.
+                    attention_out, _ = self.attention(
+                        hidden, encoder_outs, encoder_padding_mask
+                    )
+                    if self.dropout is not None:
+                        attention_out = self.dropout(attention_out)
+                    attention_outs.append(attention_out)
+                layer_input = attention_out
+
+            # collect the output of the top layer
+            outs.append(hidden)
+
+        # cache previous states (no-op except during incremental generation)
+        utils.set_incremental_state(
+            self, incremental_state, "cached_state", (prev_hiddens, prev_cells)
+        )
+
+        # collect outputs across time steps: T x B x C
+        x = torch.stack(outs, dim=0)
+        attention_outs_concat = torch.stack(attention_outs, dim=0)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+        attention_outs_concat = attention_outs_concat.transpose(0, 1)
+
+        # concat LSTM output, attention output and embedding
+        # before output projection
+        x = torch.cat((x, attention_outs_concat, embeddings), dim=2)
+        x = self.deep_output_layer(x)
+        x = torch.tanh(x)
+        if self.dropout is not None:
+            x = self.dropout(x)
+        # project back to size of vocabulary
+        x = self.output_projection(x)
+
+        # to return the full attn_scores tensor, we need to fix the decoder
+        # to account for subsampling input frames
+        return x, None
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        """Reorder the cached hidden/cell states along the batch axis."""
+        super().reorder_incremental_state(incremental_state, new_order)
+        cached_state = utils.get_incremental_state(
+            self, incremental_state, "cached_state"
+        )
+        if cached_state is None:
+            return
+
+        def reorder_state(state):
+            # The cache is a pair of per-layer lists; recurse into lists and
+            # reindex the tensors on dim 0 (batch).
+            if isinstance(state, list):
+                return [reorder_state(state_i) for state_i in state]
+            return state.index_select(0, new_order)
+
+        new_state = tuple(map(reorder_state, cached_state))
+        utils.set_incremental_state(self, incremental_state, "cached_state", new_state)
+
+
+@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard")
+def berard(args):
+    """Defaults of the original model from "End-to-End Automatic Speech
+    Translation of Audiobooks" (https://arxiv.org/abs/1802.04200).
+
+    Only attributes missing from ``args`` are set.
+    """
+    defaults = (
+        ("input_layers", "[256, 128]"),
+        ("conv_layers", "[(16, 3, 2), (16, 3, 2)]"),
+        ("num_blstm_layers", 3),
+        ("lstm_size", 256),
+        ("dropout", 0.2),
+        ("decoder_embed_dim", 128),
+        ("decoder_num_layers", 2),
+        ("decoder_hidden_dim", 512),
+        ("attention_dim", 512),
+        ("output_layer_dim", 128),
+        ("load_pretrained_encoder_from", None),
+        ("load_pretrained_decoder_from", None),
+    )
+    for name, value in defaults:
+        setattr(args, name, getattr(args, name, value))
+
+
+@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_256_3_3")
+def berard_256_3_3(args):
+    """Variant with three decoder layers by default. Used in
+    * "Harnessing Indirect Training Data for End-to-End Automatic Speech
+    Translation: Tricks of the Trade" (https://arxiv.org/abs/1909.06515)
+    * "CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus"
+    (https://arxiv.org/pdf/2002.01320.pdf)
+    * "Self-Supervised Representations Improve End-to-End Speech Translation"
+    (https://arxiv.org/abs/2006.12124)
+    """
+    # Set before delegating so the base defaults do not overwrite it.
+    setattr(args, "decoder_num_layers", getattr(args, "decoder_num_layers", 3))
+    berard(args)
+
+
+@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_3_2")
+def berard_512_3_2(args):
+    """Wider variant: LSTM size 512, 3 encoder bi-LSTM layers, 2 decoder layers.
+
+    The overrides are installed first; ``berard`` then fills in whatever is
+    still missing.
+    """
+    overrides = (
+        ("num_blstm_layers", 3),
+        ("lstm_size", 512),
+        ("dropout", 0.3),
+        ("decoder_embed_dim", 256),
+        ("decoder_num_layers", 2),
+        ("decoder_hidden_dim", 1024),
+        ("attention_dim", 512),
+        ("output_layer_dim", 256),
+    )
+    for name, value in overrides:
+        setattr(args, name, getattr(args, name, value))
+    berard(args)
+
+
+@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_5_3")
+def berard_512_5_3(args):
+    """Deeper variant: LSTM size 512, 5 encoder bi-LSTM layers, 3 decoder layers.
+
+    The overrides are installed first; ``berard`` then fills in whatever is
+    still missing.
+    """
+    overrides = (
+        ("num_blstm_layers", 5),
+        ("lstm_size", 512),
+        ("dropout", 0.3),
+        ("decoder_embed_dim", 256),
+        ("decoder_num_layers", 3),
+        ("decoder_hidden_dim", 1024),
+        ("attention_dim", 512),
+        ("output_layer_dim", 256),
+    )
+    for name, value in overrides:
+        setattr(args, name, getattr(args, name, value))
+    berard(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
new file mode 100644
index 0000000000..4d0fc02aee
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
@@ -0,0 +1,443 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_text.modules.convolution import infer_conv_output_dim
+from fairseq.models.transformer import Embedding, TransformerDecoder
+from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerEncoderLayer
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("convtransformer")
+class ConvTransformerModel(FairseqEncoderDecoderModel):
+    """
+    Speech translation model from ESPNet-ST (https://arxiv.org/abs/2004.10234):
+    a ConvTransformerEncoder followed by a TransformerDecoderNoExtra decoder.
+    """
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Register the model-specific command-line options on *parser*."""
+        parser.add_argument(
+            "--input-feat-per-channel",
+            type=int,
+            metavar="N",
+            help="encoder input dimension per input channel",
+        )
+        parser.add_argument(
+            "--activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--activation-dropout",
+            "--relu-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN.",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--decoder-output-dim",
+            type=int,
+            metavar="N",
+            help="decoder output dimension (extra linear layer if different from decoder embed dim)",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--layernorm-embedding",
+            action="store_true",
+            help="add layernorm to embedding",
+        )
+        parser.add_argument(
+            "--no-scale-embedding",
+            action="store_true",
+            help="if True, dont scale embeddings",
+        )
+        parser.add_argument(
+            "--load-pretrained-encoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take encoder weights from (for initialization)",
+        )
+        parser.add_argument(
+            "--load-pretrained-decoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take decoder weights from (for initialization)",
+        )
+        parser.add_argument(
+            "--conv-out-channels",
+            type=int,
+            metavar="INT",
+            help="the number of output channels of conv layer",
+        )
+
+    @classmethod
+    def build_encoder(cls, args):
+        encoder = ConvTransformerEncoder(args)
+        if getattr(args, "load_pretrained_encoder_from", None) is not None:
+            encoder = checkpoint_utils.load_pretrained_component_from_model(
+                component=encoder, checkpoint=args.load_pretrained_encoder_from
+            )
+        return encoder
+
+    @classmethod
+    def build_decoder(cls, args, task, embed_tokens):
+        decoder = TransformerDecoderNoExtra(args, task.target_dictionary, embed_tokens)
+        if getattr(args, "load_pretrained_decoder_from", None) is not None:
+            decoder = checkpoint_utils.load_pretrained_component_from_model(
+                component=decoder, checkpoint=args.load_pretrained_decoder_from
+            )
+        return decoder
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build the target embedding, encoder and decoder from *args* and *task*."""
+
+        # back-fill defaults so args from older models have every expected option
+        base_architecture(args)
+
+        def build_embedding(dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            return Embedding(num_embeddings, embed_dim, padding_idx)
+
+        decoder_embed_tokens = build_embedding(
+            task.target_dictionary, args.decoder_embed_dim
+        )
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args, task, decoder_embed_tokens)
+        return cls(encoder, decoder)
+
+    @staticmethod
+    @torch.jit.unused
+    def set_batch_first(lprobs):
+        lprobs.batch_first = True
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        # net_output['encoder_out'] is a (B, T, D) tensor
+        lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+        if self.training:
+            self.set_batch_first(lprobs)
+        return lprobs
+
+    def output_layout(self):
+        return "BTD"
+
+    """
+    The forward method inherited from the base class has a **kwargs argument in
+    its input, which is not supported in torchscript. This method overrites the forward
+    method definition without **kwargs.
+    """
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens):
+        encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths)
+        decoder_out = self.decoder(
+            prev_output_tokens=prev_output_tokens, encoder_out=encoder_out
+        )
+        return decoder_out
+
+
+class ConvTransformerEncoder(FairseqEncoder):
+    """Two stride-2 Conv2d layers, a linear projection, then Transformer layers."""
+
+    def __init__(self, args):
+        """Build the conv front-end, projection, positions and layers from *args*."""
+        super().__init__(None)
+
+        self.dropout = args.dropout
+        self.embed_scale = (
+            1.0 if args.no_scale_embedding else math.sqrt(args.encoder_embed_dim)
+        )
+        self.padding_idx = 1
+        self.in_channels = 1
+        self.input_dim = args.input_feat_per_channel
+        self.conv = torch.nn.Sequential(
+            torch.nn.Conv2d(1, args.conv_out_channels, 3, stride=2, padding=3 // 2),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(
+                args.conv_out_channels,
+                args.conv_out_channels,
+                3,
+                stride=2,
+                padding=3 // 2,
+            ),
+            torch.nn.ReLU(),
+        )
+        transformer_input_dim = infer_conv_output_dim(
+            self.in_channels, self.input_dim, args.conv_out_channels
+        )
+        self.out = torch.nn.Linear(transformer_input_dim, args.encoder_embed_dim)
+        self.embed_positions = PositionalEmbedding(
+            args.max_source_positions,
+            args.encoder_embed_dim,
+            self.padding_idx,
+            learned=False,
+        )
+
+        self.transformer_layers = nn.ModuleList(
+            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
+        )
+        # NOTE(review): forward() in this class never applies self.layer_norm.
+        if args.encoder_normalize_before:
+            self.layer_norm = LayerNorm(args.encoder_embed_dim)
+        else:
+            self.layer_norm = None
+
+    def pooling_ratio(self):
+        return 4  # the two stride-2 convolutions in self.conv
+
+    def forward(self, src_tokens, src_lengths):
+        """Encode a batch of input features.
+        :param torch.Tensor src_tokens: features, (batch, time, in_channels * input_dim)
+        :param torch.Tensor src_lengths: number of valid frames per batch item
+        :return: dict whose "encoder_out" holds the (time', batch, embed_dim) output
+            and whose "encoder_padding_mask" is non-empty only if padding exists
+        """
+        bsz, max_seq_len, _ = src_tokens.size()
+        x = (
+            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
+        x = self.conv(x)
+        bsz, _, output_seq_len, _ = x.size()
+        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
+        x = self.out(x)
+        x = self.embed_scale * x
+
+        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
+        input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long()
+        # Allocate the cap directly with input_len_0's dtype and device rather
+        # than building it on the CPU and copying it over on every forward pass.
+        input_len_1 = torch.full_like(input_len_0, x.size(0))
+        input_lengths = torch.min(input_len_0, input_len_1)
+
+        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
+
+        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
+        x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        for layer in self.transformer_layers:
+            x = layer(x, encoder_padding_mask)
+
+        if not encoder_padding_mask.any():
+            maybe_encoder_padding_mask = None
+        else:
+            maybe_encoder_padding_mask = encoder_padding_mask
+
+        return {
+            "encoder_out": [x],
+            "encoder_padding_mask": [maybe_encoder_padding_mask]
+            if maybe_encoder_padding_mask is not None
+            else [],
+            "encoder_embedding": [],
+            "encoder_states": [],
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    @torch.jit.export
+    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
+        """
+        Reorder encoder output according to *new_order*.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): desired order
+
+        Returns:
+            *encoder_out* rearranged according to *new_order*
+        """
+        new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)]
+        if len(encoder_out["encoder_padding_mask"]) == 0:
+            new_encoder_padding_mask = []
+        else:
+            new_encoder_padding_mask = [
+                (encoder_out["encoder_padding_mask"][0]).index_select(0, new_order)
+            ]
+        if len(encoder_out["encoder_embedding"]) == 0:
+            new_encoder_embedding = []
+        else:
+            new_encoder_embedding = [
+                (encoder_out["encoder_embedding"][0]).index_select(0, new_order)
+            ]
+        encoder_states = encoder_out["encoder_states"]
+        if len(encoder_states) > 0:
+            for idx, state in enumerate(encoder_states):
+                encoder_states[idx] = state.index_select(1, new_order)
+
+        return {
+            "encoder_out": new_encoder_out,
+            "encoder_padding_mask": new_encoder_padding_mask,
+            "encoder_embedding": new_encoder_embedding,
+            "encoder_states": encoder_states,
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+
+class TransformerDecoderNoExtra(TransformerDecoder):
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        # reuse the parent's scriptable implementation but discard its second output
+        x, _ = self.extract_features_scriptable(
+            prev_output_tokens,
+            encoder_out,
+            incremental_state,
+            full_context_alignment,
+            alignment_layer,
+            alignment_heads,
+        )
+        return x, None
+
+
+@register_model_architecture(model_name="convtransformer", arch_name="convtransformer")
+def base_architecture(args):
+    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    # some defaults below reuse values resolved above, so statement order matters
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+    args.max_source_positions = getattr(args, "max_source_positions", 3000)
+    args.max_target_positions = getattr(args, "max_target_positions", 1024)
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    args.conv_out_channels = getattr(args, "conv_out_channels", args.encoder_embed_dim)
+
+
+@register_model_architecture("convtransformer", "convtransformer_espnet")
+def convtransformer_espnet(args):
+    """Default unset options to a 256-dim, 12-layer encoder and 4 attention heads."""
+    heads = dict(encoder_attention_heads=4, decoder_attention_heads=4)
+    for name, value in dict(encoder_embed_dim=256, encoder_layers=12, **heads).items():
+        setattr(args, name, getattr(args, name, value))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
new file mode 100644
index 0000000000..d78427f687
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from argparse import Namespace
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import fairseq.data.audio.feature_transforms.utterance_cmvn as utt_cmvn
+from fairseq.data import encoders
+from fairseq.data.audio.audio_utils import convert_waveform as convert_wav
+from fairseq.data.audio.audio_utils import get_fbank
+from fairseq.data.audio.audio_utils import get_waveform as get_wav
+from fairseq.data.audio.speech_to_text_dataset import SpeechToTextDataset
+
+logger = logging.getLogger(__name__)
+
+
+class S2THubInterface(nn.Module):
+    """Hub wrapper bundling a config, task, model and generator for inference."""
+
+    def __init__(self, cfg, task, model):
+        super().__init__()
+        self.cfg = cfg
+        self.task = task
+        self.model = model
+        self.model.eval()
+        self.generator = self.task.build_generator([self.model], self.cfg.generation)
+
+    @classmethod
+    def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
+        """Build a one-utterance sample dict from *audio* (file path or tensor)."""
+        input_type = task.data_cfg.hub.get("input_type", "fbank80")
+        if input_type == "fbank80_w_utt_cmvn":
+            if isinstance(audio, str):
+                feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))
+            else:
+                import torchaudio.compliance.kaldi as kaldi
+
+                feat = kaldi.fbank(audio, num_mel_bins=80).numpy()
+            # torch.from_numpy() below needs an ndarray and ndarrays have no
+            # unsqueeze(), so add the missing batch axis by indexing instead.
+            if feat.ndim == 2:
+                feat = feat[None]  # T x D -> 1 x T x D
+        elif input_type in {"waveform", "standardized_waveform"}:
+            if isinstance(audio, str):
+                feat, sr = get_wav(audio)  # C x T, reduced to 1 x T below
+                feat, _ = convert_wav(feat, sr, to_sample_rate=16_000, to_mono=True)
+            else:
+                feat = audio.numpy()
+        else:  # NOTE(review): also reached by the default "fbank80" -- intended?
+            raise ValueError(f"Unknown value: input_type = {input_type}")
+
+        src_lengths = torch.Tensor([feat.shape[1]]).long()
+        src_tokens = torch.from_numpy(feat)  # 1 x T (x D)
+        if input_type == "standardized_waveform":
+            with torch.no_grad():
+                src_tokens = F.layer_norm(src_tokens, src_tokens.shape)
+
+        net_input = {
+            "src_tokens": src_tokens,
+            "src_lengths": src_lengths,
+            "prev_output_tokens": None,
+        }
+        return {"net_input": net_input, "target_lengths": None, "speaker": None}
+
+    @classmethod
+    def detokenize(cls, task, tokens):
+        """Convert *tokens* to text, BPE-decoding it when a tokenizer is built."""
+        text = task.tgt_dict.string(tokens)
+        tokenizer = encoders.build_bpe(Namespace(**task.data_cfg.bpe_tokenizer))
+        return text if tokenizer is None else tokenizer.decode(text)
+
+    @classmethod
+    def get_prefix_token(cls, task, lang):
+        """Return the language-tag index of *lang* as a 1 x 1 tensor, or None."""
+        if int(task.data_cfg.prepend_tgt_lang_tag) <= 0:
+            return None
+        assert lang is not None
+        lang_tag = SpeechToTextDataset.get_lang_tag_idx(lang, task.tgt_dict)
+        return torch.Tensor([lang_tag]).long().unsqueeze(0)
+
+    @classmethod
+    def get_prediction(
+        cls, task, model, generator, sample, tgt_lang=None, synthesize_speech=False
+    ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]:
+        """Generate text for *sample*, optionally paired with synthesized speech."""
+        _tgt_lang = tgt_lang or task.data_cfg.hub.get("tgt_lang", None)
+        prefix = cls.get_prefix_token(task, _tgt_lang)
+        pred_tokens = generator.generate([model], sample, prefix_tokens=prefix)
+        pred = cls.detokenize(task, pred_tokens[0][0]["tokens"])
+        if task.data_cfg.config.get("eos_token", None):  # then drop the last word
+            pred = " ".join(pred.split(" ")[:-1])
+
+        if synthesize_speech:
+            pfx = f"{_tgt_lang}_" if task.data_cfg.prepend_tgt_lang_tag else ""
+            tts_model_id = task.data_cfg.hub.get(f"{pfx}tts_model_id", None)
+            speaker = task.data_cfg.hub.get(f"{pfx}speaker", None)
+            if tts_model_id is None:
+                logger.warning("TTS model configuration not found")
+            else:
+                _repo, _id = tts_model_id.split(":")
+                tts_model = torch.hub.load(_repo, _id, verbose=False)
+                pred = (pred, tts_model.predict(pred, speaker=speaker))
+        return pred
+
+    def predict(
+        self,
+        audio: Union[str, torch.Tensor],
+        tgt_lang: Optional[str] = None,
+        synthesize_speech: bool = False,
+    ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]:
+        """Run inference on *audio*, which is either a file path or a 1xT Tensor,
+        and return either text or (text, synthetic speech)."""
+        sample = self.get_model_input(self.task, audio)
+        return self.get_prediction(
+            self.task,
+            self.model,
+            self.generator,
+            sample,
+            tgt_lang=tgt_lang,
+            synthesize_speech=synthesize_speech,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
new file mode 100644
index 0000000000..2d330f96f6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
@@ -0,0 +1,487 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from fairseq.models import FairseqEncoder
+from fairseq.models.speech_to_text import ConvTransformerEncoder
+from fairseq.models.speech_to_text.utils import (
+    attention_suppression,
+    lengths_to_encoder_padding_mask,
+    segments_to_sequence,
+    sequence_to_segments,
+)
+from fairseq.modules import MultiheadAttention, TransformerEncoderLayer
+
+# ------------------------------------------------------------------------------
+#   AugmentedMemoryConvTransformerEncoder
+# ------------------------------------------------------------------------------
+
+
+class AugmentedMemoryConvTransformerEncoder(ConvTransformerEncoder):
+    """ConvTransformerEncoder built from AugmentedMemoryTransformerEncoderLayer."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+        # the layers created below read args.encoder_stride in their __init__
+        args.encoder_stride = self.stride()
+
+        self.left_context = args.left_context // args.encoder_stride
+
+        self.right_context = args.right_context // args.encoder_stride
+
+        self.left_context_after_stride = args.left_context // args.encoder_stride
+        self.right_context_after_stride = args.right_context // args.encoder_stride
+
+        self.transformer_layers = nn.ModuleList(
+            [
+                AugmentedMemoryTransformerEncoderLayer(args)
+                for _ in range(args.encoder_layers)
+            ]
+        )
+
+    def stride(self):
+        """Return the stride used to convert context sizes (currently fixed at 4)."""
+        # Hard coded here. Should infer from convs in future
+        stride = 4
+        return stride
+
+    def forward(self, src_tokens, src_lengths, states=None):
+        """Encode one batch of input features.
+        :param torch.Tensor src_tokens: features, (batch, time, in_channels * input_dim)
+        :param torch.Tensor src_lengths: number of valid frames per batch item
+        :param states: optional list of per-layer dicts ("memory_banks",
+            "encoder_states"); created when None, updated in place
+        :return: (last layer's output without the left/right context frames,
+            per-item count of unpadded frames in that region, states)
+        """
+        bsz, max_seq_len, _ = src_tokens.size()
+        x = (
+            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
+        x = self.conv(x)
+        bsz, _, output_seq_len, _ = x.size()
+        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
+        x = self.out(x)
+        x = self.embed_scale * x
+
+        subsampling_factor = 1.0 * max_seq_len / output_seq_len
+        # NOTE(review): torch.max makes every length at least x.size(0), whereas
+        # ConvTransformerEncoder.forward caps lengths with torch.min -- confirm.
+        input_lengths = torch.max(
+            (src_lengths.float() / subsampling_factor).ceil().long(),
+            x.size(0) * src_lengths.new_ones([src_lengths.size(0)]).long(),
+        )
+
+        encoder_padding_mask, _ = lengths_to_encoder_padding_mask(
+            input_lengths, batch_first=True
+        )
+
+        # TODO: fix positional embedding
+        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
+
+        x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # State to store memory banks etc.
+        if states is None:
+            states = [
+                {"memory_banks": None, "encoder_states": None}
+                for i in range(len(self.transformer_layers))
+            ]
+
+        # Explicit end indices: a "-right" slice bound would be "-0" when there is
+        # no right context and would select nothing instead of the whole tail.
+        left, right = self.left_context_after_stride, self.right_context_after_stride
+        for i, layer in enumerate(self.transformer_layers):
+            # x is (time, batch, dim); per the original note, time is
+            # (left + segment + right context) / stride. TODO: Consider mask here
+            x = layer(x, states[i])
+            states[i]["encoder_states"] = x[left : max(x.size(0) - right, 0)]
+
+        mask_end = max(encoder_padding_mask.size(1) - right, 0)
+        segment_mask = encoder_padding_mask[:, left:mask_end]
+        lengths = (~segment_mask).sum(dim=1, keepdim=True).long()
+
+        return states[-1]["encoder_states"], lengths, states
+
+
+# ------------------------------------------------------------------------------
+#   AugmentedMemoryTransformerEncoderLayer
+# ------------------------------------------------------------------------------
+class AugmentedMemoryTransformerEncoderLayer(TransformerEncoderLayer):
+    def __init__(self, args):
+        super().__init__(args)
+
+        self.left_context = args.left_context // args.encoder_stride
+        self.right_context = args.right_context // args.encoder_stride
+
+    def forward(self, x, state):
+        """Attend over x plus a summary query, then apply the feed-forward block."""
+        length, batch_size, x_dim = x.size()
+
+        residual = x
+
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        # no memory banks stored in this state yet: start from an empty list
+        if state.get("memory_banks", None) is None:
+            state["memory_banks"] = []
+
+        # TODO research new sum_query method (currently: mean over the segment)
+        seg_start = self.left_context
+        seg_end = length - self.right_context
+        if seg_start < seg_end:
+            summarization_query = torch.mean(x[seg_start:seg_end], keepdim=True, dim=0)
+        else:
+            summarization_query = x.new_zeros(1, batch_size, x_dim)
+
+        x = torch.cat([x, summarization_query], dim=0)
+
+        x = self.self_attn(input_and_summary=x, state=state)
+
+        x = self.dropout_module(x)
+        x = residual + x
+
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        return x
+
+    def build_self_attention(self, embed_dim, args):
+        return AugmentedMemoryMultiheadAttention(
+            embed_dim=embed_dim,
+            num_heads=args.encoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+            tanh_on_mem=True,
+            max_memory_size=args.max_memory_size,
+        )
+
+
+# ------------------------------------------------------------------------------
+#   AugmentedMemoryMultiheadAttention
+# ------------------------------------------------------------------------------
+class AugmentedMemoryMultiheadAttention(MultiheadAttention):
+    """
+    Augmented Memory Attention from
+    Streaming Transformer-based Acoustic Models
+    Using Self-attention with Augmented Memory
+    https://arxiv.org/abs/2005.08042
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        self_attention=False,
+        encoder_decoder_attention=False,
+        q_noise=0.0,
+        qn_block_size=8,
+        tanh_on_mem=False,
+        memory_dim=None,
+        std_scale=0.5,  # 0.5 based on https://arxiv.org/abs/2005.09137
+        max_memory_size=-1,
+        disable_mem_on_mem_attn=True,
+    ):
+        super().__init__(
+            embed_dim,
+            num_heads,
+            kdim,
+            vdim,
+            dropout,
+            bias,
+            add_bias_kv,
+            add_zero_attn,
+            self_attention,
+            encoder_decoder_attention,
+            q_noise,
+            qn_block_size,
+        )
+
+        self.memory_dim = memory_dim if memory_dim is not None else embed_dim
+        self.std_scale = std_scale
+        self.disable_mem_on_mem_attn = disable_mem_on_mem_attn
+
+        # This Operator was used for factorization in PySpeech
+        self.v2e = lambda x: x
+
+        if tanh_on_mem:
+            self.squash_mem = torch.tanh
+            self.nonlinear_squash_mem = True
+        else:
+            self.squash_mem = lambda x: x
+            self.nonlinear_squash_mem = False
+
+        self.max_memory_size = max_memory_size
+
+    def forward(self, input_and_summary, state):
+        """Attend from [segment, summary query] over [memory bank, segment].
+
+        input_and_summary: (length + 1, batch, dim); rows are the current segment
+            (with left/right context) and, last, the summarization query.
+        state["memory_banks"] (a list) gets the new memory appended; the
+        attention output for the first `length` rows is returned."""
+        length, batch_size, _ = input_and_summary.shape
+        length = length - 1  # not include sum_query, last index
+
+        memory = state["memory_banks"]
+        # TODO: positional embedding on memory
+
+        if self.max_memory_size > -1 and len(memory) > self.max_memory_size:
+            # The bank is a list of tensors, so the placeholder is a one-item list.
+            if self.max_memory_size == 0:
+                memory = [input_and_summary.new_zeros(1, batch_size, self.memory_dim)]
+            else:
+                memory = memory[-self.max_memory_size :]
+
+        memory_and_input = torch.cat(memory + [input_and_summary[:-1]], dim=0)
+        input_and_sum_query = input_and_summary
+
+        q = self.q_proj(self.v2e(input_and_sum_query))
+        k = self.k_proj(self.v2e(memory_and_input))
+        v = self.v_proj(self.v2e(memory_and_input))
+
+        q = (
+            q.contiguous()
+            .view(-1, batch_size * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+            * self.scaling
+        )
+        k = (
+            k.contiguous()
+            .view(-1, batch_size * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+
+        v = (
+            v.contiguous()
+            .view(-1, batch_size * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+
+        attention_weights = torch.bmm(q, k.transpose(1, 2))
+
+        if self.disable_mem_on_mem_attn:
+            attention_weights = self.suppress_mem_on_mem_attention(
+                batch_size, self.num_heads, len(memory), attention_weights
+            )
+
+        if self.std_scale is not None:
+            attention_weights = attention_suppression(attention_weights, self.std_scale)
+
+        assert list(attention_weights.shape) == [
+            batch_size * self.num_heads,
+            length + 1,
+            length + len(memory),
+        ]
+
+        attention_weights = torch.nn.functional.softmax(
+            attention_weights.float(), dim=-1
+        ).type_as(attention_weights)
+
+        attention_probs = self.dropout_module(attention_weights)
+
+        # [T, T, B, n_head] + [T, B, n_head, d_head] -> [T, B, n_head, d_head]
+        attention = torch.bmm(attention_probs, v)
+
+        assert list(attention.shape) == [
+            batch_size * self.num_heads,
+            length + 1,
+            self.head_dim,
+        ]
+
+        attention = (
+            attention.transpose(0, 1)
+            .contiguous()
+            .view(length + 1, batch_size, self.embed_dim)
+        )
+
+        output_and_memory = self.out_proj(attention)
+
+        next_m = output_and_memory[-1:]  # row produced by the summary query
+        next_m = self.squash_mem(next_m)
+        output = output_and_memory[:-1]
+
+        state["memory_banks"].append(next_m)  # mutates the caller's state
+
+        return output
+
+    def suppress_mem_on_mem_attention(
+        self, B: int, num_heads: int, mem_size: int, attention_weight: Tensor
+    ):
+        """Block the summary query (last row) from attending to memory columns.
+
+        Args:
+            B, num_heads: batch size and head count (not used by the masking)
+            mem_size: number of leading key columns that hold memory entries
+            attention_weight: [B*num_heads, T + 1, T + mem_size] raw scores
+
+        The tensor is edited in place and the same object is returned.
+        """
+        summary_row = attention_weight[:, -1]
+        summary_row[:, :mem_size] = float("-inf")
+        return attention_weight
+
+
+# ------------------------------------------------------------------------------
+#   SequenceEncoder
+# ------------------------------------------------------------------------------
+class SequenceEncoder(FairseqEncoder):
+    """
+    SequenceEncoder encodes sequences.
+
+    More specifically, `src_tokens` and `src_lengths` in `forward()` should
+    describe a batch of "complete" sequences rather than segments.
+
+    Segment-by-segment inference can be triggered by `segment_size`:
+    1) `segment_size` is None:
+        SequenceEncoder treats the input sequence as one single segment.
+    2) `segment_size` is not None (some int instead):
+        SequenceEncoder does the following:
+            1. breaks the input sequence into several segments
+            2. runs inference on each segment and collects the outputs
+            3. concatenates segment outputs into the output sequence.
+    Note that `segment_size` here shouldn't include additional left/right
+    contexts needed, for example if we wish to infer with LC-BLSTM where the
+    middle chunk size is 100 and right context is 20, `segment_size` should be
+    100.
+    """
+
+    def __init__(self, args, module):
+        super().__init__(None)
+
+        self.module = module
+        self.input_time_axis = 1  # time axis handed to sequence_to_segments
+        self.output_time_axis = 0  # time axis handed to segments_to_sequence
+        self.segment_size = args.segment_size
+        self.left_context = args.left_context
+        self.right_context = args.right_context
+
+    def forward(
+        self,
+        src_tokens: Tensor,
+        src_lengths: Tensor,
+        states=None,
+    ):
+        # Cut the batch into (tokens, lengths) segment pairs and encode them in order.
+        seg_src_tokens_lengths = sequence_to_segments(
+            sequence=src_tokens,
+            time_axis=self.input_time_axis,
+            lengths=src_lengths,
+            segment_size=self.segment_size,
+            extra_left_context=self.left_context,
+            extra_right_context=self.right_context,
+        )
+
+        seg_encoder_states_lengths: List[Tuple[Tensor, Tensor]] = []
+
+        for seg_src_tokens, seg_src_lengths in seg_src_tokens_lengths:
+            (seg_encoder_states, seg_enc_lengths, states) = self.module(
+                seg_src_tokens,
+                seg_src_lengths,
+                states=states,  # states returned for one segment feed the next
+            )
+
+            seg_encoder_states_lengths.append((seg_encoder_states, seg_enc_lengths))
+
+        encoder_out, enc_lengths = segments_to_sequence(
+            segments=seg_encoder_states_lengths, time_axis=self.output_time_axis
+        )
+
+        encoder_padding_mask, _ = lengths_to_encoder_padding_mask(
+            enc_lengths, batch_first=True
+        )
+
+        if not encoder_padding_mask.any():  # mask has no True entry: report None
+            encoder_padding_mask = None
+
+        return {
+            "encoder_out": [encoder_out],
+            "encoder_padding_mask": [encoder_padding_mask],
+            "encoder_embedding": [],
+            "encoder_states": [states],  # states left after the last segment
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    def incremental_encode(
+        self,
+        seg_src_tokens: Tensor,
+        seg_src_lengths: Tensor,
+        states=None,
+    ):
+        """
+        Different from the forward function, this function takes one already
+        segmented input, runs `self.module` on it with the previous `states`,
+        and returns (segment encoder states, segment lengths, updated states).
+        """
+        (seg_encoder_states, seg_enc_lengths, states) = self.module(
+            seg_src_tokens,
+            seg_src_lengths,
+            states=states,
+        )
+        return seg_encoder_states, seg_enc_lengths, states
+
+
+# ------------------------------------------------------------------------------
+#   Augmented memory model decorator
+# ------------------------------------------------------------------------------
+def augmented_memory(klass):
+    """Class decorator: subclass `klass` and add the segment/memory CLI options."""
+    class StreamSeq2SeqModel(klass):
+        @staticmethod
+        def add_args(parser):
+            super(StreamSeq2SeqModel, StreamSeq2SeqModel).add_args(parser)
+            parser.add_argument(
+                "--segment-size", type=int, required=True, help="Length of the segment."
+            )
+            parser.add_argument(
+                "--left-context",
+                type=int,
+                default=0,
+                help="Left context for the segment.",
+            )
+            parser.add_argument(
+                "--right-context",
+                type=int,
+                default=0,
+                help="Right context for the segment.",
+            )
+            parser.add_argument(
+                "--max-memory-size",
+                type=int, default=-1,
+                help="Maximum number of memory banks to keep (-1: no limit).",
+            )
+
+    StreamSeq2SeqModel.__name__ = klass.__name__
+    return StreamSeq2SeqModel
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
new file mode 100644
index 0000000000..526d7540c5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import List
+
+import torch
+import torch.nn as nn
+
+
+class Conv1dSubsampler(nn.Module):
+    """Convolutional subsampler: a stack of 1D convolutions (along the temporal
+    dimension), each followed by a gated-linear-unit non-linearity
+    (https://arxiv.org/abs/1911.08460). Every layer uses stride 2.
+
+    Args:
+        in_channels (int): the number of input channels
+        mid_channels (int): the number of intermediate channels
+        out_channels (int): the number of output channels
+        kernel_sizes (List[int]): the kernel size for each convolutional layer
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        mid_channels: int,
+        out_channels: int,
+        kernel_sizes: List[int] = (3, 3),
+    ):
+        super(Conv1dSubsampler, self).__init__()
+        self.n_layers = len(kernel_sizes)
+        self.conv_layers = nn.ModuleList(
+            nn.Conv1d(
+                in_channels if i == 0 else mid_channels // 2,  # GLU halves channels
+                mid_channels if i < self.n_layers - 1 else out_channels * 2,
+                k,
+                stride=2,
+                padding=k // 2,
+            )
+            for i, k in enumerate(kernel_sizes)
+        )
+
+    def get_out_seq_lens_tensor(self, in_seq_lens_tensor):
+        out = in_seq_lens_tensor.clone()  # leave the caller's tensor untouched
+        for _ in range(self.n_layers):  # one length update per conv layer
+            out = ((out.float() - 1) / 2 + 1).floor().long()
+        return out
+
+    def forward(self, src_tokens, src_lengths):
+        bsz, in_seq_len, _ = src_tokens.size()  # B x T x (C x D)
+        x = src_tokens.transpose(1, 2).contiguous()  # -> B x (C x D) x T
+        for conv in self.conv_layers:
+            x = conv(x)
+            x = nn.functional.glu(x, dim=1)  # gate along the channel axis
+        _, _, out_seq_len = x.size()  # out_seq_len is not used below
+        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # -> T x B x (C x D)
+        return x, self.get_out_seq_lens_tensor(src_lengths)
+
+
+def infer_conv_output_dim(in_channels, input_dim, out_channels):
+    """Return the flattened feature size after two 3x3 stride-2 Conv2d layers."""
+    probe = torch.randn(10, in_channels, 200, input_dim)  # dummy batch of 10 x 200 frames
+    for n_in in (in_channels, out_channels):
+        probe = torch.nn.Conv2d(n_in, out_channels, 3, stride=2, padding=1)(probe)
+    probe = probe.transpose(1, 2)
+    n_batch, n_frames = probe.size()[:2]
+    # the remaining axes are merged into the last one; its size is the answer
+    return probe.contiguous().view(n_batch, n_frames, -1).size(-1)
+
+
+class Conv2dSubsampler(nn.Module):
+    """Convolutional subsampler: two 3x3 stride-2 2D convolutions (ESPnet style,
+    https://github.com/espnet/espnet) followed by a linear projection.
+
+    Args:
+        input_channels (int): the number of input channels (must be 1)
+        input_feat_per_channel (int): encoder input dimension per input channel
+        conv_out_channels (int): the number of output channels of conv layer
+        encoder_embed_dim (int): encoder dimension
+    """
+
+    def __init__(
+        self,
+        input_channels: int,
+        input_feat_per_channel: int,
+        conv_out_channels: int,
+        encoder_embed_dim: int,
+    ):
+        super().__init__()
+        assert input_channels == 1, input_channels
+        self.conv = torch.nn.Sequential(
+            torch.nn.Conv2d(
+                input_channels, conv_out_channels, 3, stride=2, padding=3 // 2
+            ),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(
+                conv_out_channels,
+                conv_out_channels,
+                3,
+                stride=2,
+                padding=3 // 2,
+            ),
+            torch.nn.ReLU(),
+        )
+        transformer_input_dim = infer_conv_output_dim(
+            input_channels, input_feat_per_channel, conv_out_channels
+        )
+        self.out = torch.nn.Linear(transformer_input_dim, encoder_embed_dim)
+
+    def forward(self, src_tokens, src_lengths):
+        B, T_i, C = src_tokens.size()
+        x = src_tokens.view(B, T_i, 1, C).transpose(1, 2).contiguous()
+        x = self.conv(x)
+        B, _, T_o, _ = x.size()
+        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(T_o, B, -1)
+        x = self.out(x)
+
+        subsampling_factor = int(T_i * 1.0 / T_o + 0.5)
+        input_lengths = (src_lengths.float() / subsampling_factor).ceil().long()
+        # The rounded factor can over-estimate the number of frames, so cap the
+        # lengths at T_o. clamp() does this on src_lengths' device without
+        # building a helper tensor on the CPU and copying it over.
+        input_lengths = input_lengths.clamp(max=T_o)
+        return x, input_lengths
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
new file mode 100644
index 0000000000..935d593078
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
@@ -0,0 +1,1844 @@
+#!/usr/bin/env python3
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+
+import math
+import re
+from functools import partial
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch import device as Device
+
+from fairseq.models import FairseqEncoder
+from fairseq.models.speech_to_text.utils import (
+    NoOp,
+    attention_suppression,
+    layer_norm_backward_hook,
+    lengths_to_padding_mask,
+    segments_to_sequence,
+)
+
+try:
+    import torch.ao.quantization as quantization
+    from torch.ao.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
+except ImportError:
+    import torch.quantization as quantization
+    from torch.quantization.qconfig import (
+        default_dynamic_qconfig,
+        per_channel_dynamic_qconfig,
+    )
+
+
+class RelativePositionEmbedding(nn.Module):
+    """
+    Trainable table of relative-position vectors, implemented according to
+    https://arxiv.org/abs/1803.02155
+    """
+
+    def __init__(self, head_dim, max_position, norm_init=True):
+        super().__init__()
+        self.head_dim = head_dim
+        self.max_position = max_position
+        # 2 * max_position + 1 rows, each of width head_dim
+        self.embeddings = nn.Parameter(torch.Tensor(max_position * 2 + 1, head_dim))
+        initializer = nn.init.xavier_normal_ if norm_init else nn.init.xavier_uniform_
+        initializer(self.embeddings)
+
+    def forward(self, input: Tensor):
+        # indices are cast to int64 and used as row numbers into the table
+        return nn.functional.embedding(input.long(), self.embeddings)
+
+
+class Fp32LayerNorm(nn.Module):
+    """LayerNorm evaluated in float32; the output is cast back to the input dtype.
+    NOTE(review): the clamp hook is registered on `torch_module`, but forward()
+    calls the functional op instead of `torch_module(...)` - confirm it fires.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        clamp_grad=True,
+        max_grad_value=256,
+        eps=1e-5,
+        elementwise_affine=True,
+    ):
+        super().__init__()
+        self.torch_module = torch.nn.LayerNorm(
+            input_dim, eps=eps, elementwise_affine=elementwise_affine
+        )
+        if clamp_grad:
+            clamp_hook = partial(layer_norm_backward_hook, clamp_value=max_grad_value)
+            self.torch_module.register_backward_hook(clamp_hook)
+
+    def forward(self, input):
+        ln = self.torch_module
+        weight = None if ln.weight is None else ln.weight.float()
+        bias = None if ln.bias is None else ln.bias.float()
+        normed = torch.nn.functional.layer_norm(
+            input.float(), ln.normalized_shape, weight, bias, ln.eps
+        )
+        return normed.type_as(input)
+
+
+# ------------------------------------------------------------------------------
+#   PositionwiseFF
+# ------------------------------------------------------------------------------
+
+
+class PositionwiseFF(nn.Module):
+    """
+    Pre-norm FFN layer in transformer: output = input + FFN(LayerNorm(input)).
+
+    Args:
+        input_dim: input embedding dimension
+        ffn_dim: FFN layer inner dimension
+        dropout_on_fc1: dropout for first linear layer
+        dropout_on_fc2: dropout for second linear layer
+        activation_fn: activation function used after first linear layer. \
+                Only relu or gelu is supported.
+
+    """
+
+    def __init__(
+        self, input_dim, ffn_dim, dropout_on_fc1, dropout_on_fc2, activation_fn
+    ):
+        super(PositionwiseFF, self).__init__()
+
+        self.input_dim = input_dim
+        self.ffn_dim = ffn_dim
+        if activation_fn == "relu":
+            ac = nn.ReLU()
+        elif activation_fn == "gelu":
+            ac = nn.GELU()
+        else:
+            raise ValueError("Unsupported activation_fn = ({})".format(activation_fn))
+
+        # fc1 -> ac -> dropout -> fc2 -> dropout
+        self.module = nn.Sequential(
+            nn.Linear(input_dim, ffn_dim),
+            ac,
+            nn.Dropout(dropout_on_fc1),
+            nn.Linear(ffn_dim, input_dim),
+            nn.Dropout(dropout_on_fc2),
+        )
+
+        self.layer_norm = Fp32LayerNorm(input_dim)
+
+    def forward(self, input):
+        module_out = self.module(self.layer_norm(input))  # norm first, then FFN
+        output = module_out + input  # residual connection
+
+        return output
+
+    def quantize_(self, params=None):  # in place; only nn.Linear is quantized
+        if params and "per_channel" in params and params["per_channel"]:
+            qconfig = per_channel_dynamic_qconfig
+        else:
+            qconfig = default_dynamic_qconfig
+        quantization.quantize_dynamic(
+            self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
+        )
+        return self
+
+
+# ------------------------------------------------------------------------------
+#   SummarizationLayer
+# ------------------------------------------------------------------------------
+
+
+class SummarizationLayer(nn.Module):  # one summary frame per segment
+    def __init__(self, method, segment_size, embedding_dim):
+        super(SummarizationLayer, self).__init__()
+        self.segment_size = segment_size
+        self.embedding_dim = embedding_dim
+        nonlin_match = re.match(r"nonlinear\((?P<act>[a-z]+),(?P<dim>[0-9]+)\)", method)
+        self.method = method
+        if method == "mean":
+            self.module = nn.AvgPool1d(
+                kernel_size=segment_size,
+                stride=segment_size,
+                ceil_mode=True,
+            )
+        elif method == "max":
+            self.module = nn.MaxPool1d(
+                kernel_size=segment_size,
+                stride=segment_size,
+                ceil_mode=True,
+            )
+        elif method == "linear":
+            self.module = nn.Linear(segment_size, 1)
+        elif nonlin_match:  # method looks like "nonlinear(<act>,<hidden dim>)"
+            nonlin_args = nonlin_match.groupdict()
+            act_type = nonlin_args["act"]
+            hid_dim = int(nonlin_args["dim"])
+            if act_type == "relu":
+                act = nn.ReLU()
+            elif act_type == "gelu":
+                act = nn.GELU()
+            else:
+                raise ValueError("Unsupported activation_fn = ({})".format(act_type))
+            self.module = nn.Sequential(
+                nn.Linear(segment_size, hid_dim),
+                act,
+                nn.Linear(hid_dim, 1),
+            )
+        else:
+            raise ValueError("Unsupported summarization method = ({})".format(method))
+
+    def forward(self, input):
+        # T, B, D -> B, D, T
+        input = input.permute(1, 2, 0)
+        # Pooling methods handle a partial last segment themselves (ceil_mode=True).
+        if self.method == "mean" or self.method == "max":
+            output = self.module(input)
+            output = output.permute(2, 0, 1)
+            return output
+        # Learned methods: only complete segments are passed through the module.
+        full_seg_length = input.size(2) // self.segment_size * self.segment_size
+        if full_seg_length > 0:
+            # at least one seg is full
+            B = input.size(0)
+            D = input.size(1)
+            input_todo = (
+                input[:, :, :full_seg_length]
+                .contiguous()
+                .view(B, -1, self.segment_size)
+            )
+            output = self.module(input_todo)
+            output = output.view(B, D, -1)
+        else:
+            output = input.new_zeros(input.size(0), input.size(1), 0)
+        left = input.size(2) - full_seg_length  # frames in the partial last segment
+        if left > 0:
+            # when last seg is not full, use zeros as last memory placeholder
+            zeros = input.new_zeros(input.size(0), input.size(1), 1)
+            output = torch.cat([output, zeros], dim=2)
+        output = output.permute(2, 0, 1)  # B, D, n_segments -> n_segments, B, D
+        return output
+
+
+# ------------------------------------------------------------------------------
+#   NoSegAugmentedMemoryMultiheadAttentionBmm
+# ------------------------------------------------------------------------------
+
+
+class NoSegAugmentedMemoryMultiheadAttentionBmm(nn.Module):
+    """
+    Whole utterance augmented memory multihead attention using BMM.
+
+    Unlike the previous augmented memory multihead attention, where the
+    utterance is chunked into segments, here an attention mask is used to
+    achieve the same effect. The input embedding [right_context, utterance,
+    summary] is a concatenation of right context, utterance and summary.
+
+    Right context block is the concatenation of all the right context for
+    each segments. [right_context_0, right_context_1, ..., right_context_n]
+    For example, if we have utterance = [v0, v1, v2, ...., v20]. segment
+    size 8, right_context size 4. Then the right context blocks =
+    [v8, v9, v10, v11, v16, v17, v18, v19, 0, 0, 0, 0], where v8, v9, v10,
+    and v11 are the right context for first segment. v16, v17, v18 and v19
+    are the right context for second segment. 0, 0, 0 and 0 are right context
+    for the last segment.
+
+    utterance is corresponding to input embedding sequence
+
+    summary is concatenation of average of each segments. [summary_0,
+    summary_1, ..., ].
+
+    In augmented memory multihead attention, the query is [right_context,
+    utterance, summary], key is [memory, right_context, utterance]. Different
+    with AugmentedMemoryMultiheadAttentionBmm, memory here is passed from
+    previous attention layer. For the first attention layer, memory is average
+    of each segment.
+
+    Memory is a concatenation of memory from each segments in previous attention
+    layer. For example, current layer is i, then memory is [m_0, m_1, ..., m_n].
+    Each m_k is the output from seg_k in layer i-1.
+
+    args:
+        input_dim: input embedding dimension
+        num_heads: number of heads in multihead self-attention
+        dropout: attention dropout
+        std_scale: if std_scale is not None. The weak attention suppression is
+            turned on. For std_scale = 0.5, all the attention smaller than
+            mean + 0.5 * std will be suppressed.
+        scaled_init: whether to use scaled init for linear weight
+        tanh_on_mem: whether to use tanh on memory output
+        use_mem: whether to use memory or not. When max_memory_size is 0, then
+            we don't have memory anymore.
+        layer_index: current self-attention layer index that is used in depth
+            initialization
+        max_relative_position: max relative position used in relative position
+            embedding
+        rpe_old_option: To be compatible with previous model. The previous model
+            was trained with attention += attention + rpe. The correct equation
+            should be attention = attention + rpe
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        num_heads,
+        dropout=0.0,
+        std_scale=None,
+        scaled_init=False,
+        tanh_on_mem=False,
+        use_mem=True,
+        mini_batches=False,
+        negative_inf="-inf",
+        layer_index=-1,
+        max_relative_position=0,
+        rpe_old_option=True,
+    ):
+        if input_dim % num_heads:
+            raise ValueError(
+                "input_dim ({}) must be divisible by num_heads ({})".format(
+                    input_dim, num_heads
+                )
+            )
+
+        super().__init__()
+        # Keys and values share one fused projection; queries get their own.
+        embed_dim = input_dim
+        self.e2h_kv = torch.nn.Linear(input_dim, 2 * input_dim, bias=True)
+        self.e2h_q = torch.nn.Linear(input_dim, input_dim, bias=True)
+        self.rpe_old_option = rpe_old_option
+        if max_relative_position > 0:
+            self.use_rpe = True
+            self.rpe_k = RelativePositionEmbedding(
+                head_dim=input_dim // num_heads,
+                max_position=max_relative_position,
+            )
+            self.rpe_v = RelativePositionEmbedding(
+                head_dim=input_dim // num_heads,
+                max_position=max_relative_position,
+            )
+        else:
+            self.use_rpe = False
+            self.rpe_k = None
+            self.rpe_v = None
+        if scaled_init:
+            if layer_index == -1:
+                gain = 1.0 / math.sqrt(2)
+            else:
+                # https://arxiv.org/abs/2005.09684 depthwise initialization
+                # stabilizes the training greatly. Use depthwise initialization to
+                # replace incremental loss.
+                gain = 1.0 / math.sqrt(layer_index + 1)
+            torch.nn.init.xavier_uniform_(self.e2h_kv.weight, gain=gain)
+            torch.nn.init.xavier_uniform_(self.e2h_q.weight, gain=gain)
+
+        self.out_proj = torch.nn.Linear(embed_dim, embed_dim, bias=True)
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout  # probability, not a module
+
+        self.head_dim = embed_dim // num_heads
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim), applied to q
+
+        self.std_scale = std_scale
+        self.use_mem = use_mem
+        self.mini_batches = mini_batches
+        self.negative_inf = negative_inf
+
+        if tanh_on_mem:
+            self.squash_mem = torch.tanh
+            self.nonlinear_squash_mem = True
+        else:
+            self.squash_mem = NoOp()
+            self.nonlinear_squash_mem = False
+
+    def prepare_qkv(
+        self,
+        input: Tensor,
+        mems: Tensor,
+        lengths: Tensor,
+        summary_length: int,
+        lc_length: int,
+    ):
+        # T: right_context length + utterance_length  + summary_length
+        T, B, D = input.shape
+        mem_length = mems.size(0)
+        utterance_length = torch.max(lengths)  # 0-dim tensor, not a Python int
+        # Rows in front of the utterance and summary rows are right context.
+        right_context_blocks_length = T - utterance_length - summary_length
+        rc_block = input[:right_context_blocks_length, :, :]
+        utterance_block = input[right_context_blocks_length : T - summary_length, :, :]
+
+        if B == 1:  # single sequence: no padding mask is built
+            padding_mask = None
+        else:
+            klengths = lengths + mem_length + right_context_blocks_length + lc_length
+            padding_mask = lengths_to_padding_mask(lengths=klengths)
+
+        mem_rc_input = torch.cat([mems, rc_block, utterance_block], dim=0)
+        # Keys/values come from [mems, right context, utterance]; q uses all of input.
+        # In training lc_length = 0
+        key_length = mem_rc_input.size(0) + lc_length
+        rc_input_sum = input
+        q = self.e2h_q(rc_input_sum)
+        kv = self.e2h_kv(mem_rc_input)
+        k, v = kv.chunk(chunks=2, dim=2)  # split the fused projection in half
+        result_qkv = (q, k, v)
+        input_shape = (T, B, D)
+        result_lengths_info = (
+            mem_length,
+            utterance_length,
+            right_context_blocks_length,
+            key_length,
+        )
+        if padding_mask is not None:
+            assert padding_mask.size(0) == B
+            assert padding_mask.size(1) == key_length
+
+        return result_qkv, input_shape, result_lengths_info, padding_mask
+
+    def prepare_attention_weights(
+        self,
+        q: Tensor,
+        new_k: Tensor,
+        new_v: Tensor,
+        input_shape: Tuple[int, int, int],
+        rpe: Optional[Tensor],
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """Return per-head scaled scores (native and fp32 copies) and per-head values."""
+        T, B, D = input_shape
+        q = (
+            q.contiguous().view(-1, B * self.num_heads, self.head_dim).transpose(0, 1)
+            * self.scaling
+        )
+        k = (
+            new_k.contiguous()
+            .view(-1, B * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+        v = (
+            new_v.contiguous()
+            .view(-1, B * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+
+        attention_weights = torch.bmm(q, k.transpose(1, 2))
+        # Guard on rpe_k, the embedding that is actually called in this branch.
+        if self.use_rpe and rpe is not None and self.rpe_k is not None:
+            r_k = self.rpe_k(rpe)
+            # [q, B*h, d] * [q, k, d] -> [B*h, q, k]
+            attention_weights_rpe = torch.matmul(
+                q.transpose(0, 1), r_k.transpose(1, 2)
+            ).transpose(0, 1)
+            attention_weights = attention_weights + attention_weights_rpe
+        attention_weights_float = attention_weights.float()
+
+        return attention_weights, attention_weights_float, v
+
+    def prepare_attention_output(
+        self,
+        attention_weights: Tensor,
+        attention_weights_float: Tensor,
+        v: Tensor,
+        input_shape: Tuple[int, int, int],
+        key_length: int,
+        padding_mask: Optional[Tensor],
+        rpe: Optional[Tensor],
+    ) -> Tensor:
+        T, B, D = input_shape
+        if padding_mask is not None:
+            attention_weights_float = attention_weights_float.view(
+                B, self.num_heads, T, key_length
+            )
+            attention_weights_float = attention_weights_float.masked_fill(
+                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+            )
+            attention_weights_float = attention_weights_float.view(
+                B * self.num_heads, T, key_length
+            )
+        # Optional weak-attention suppression, applied before the softmax.
+        if self.std_scale is not None:
+            attention_weights_float = attention_suppression(
+                attention_weights_float, self.std_scale
+            )
+        # Softmax runs on the float copy; the result is cast back afterwards.
+        attention_weights_float = torch.nn.functional.softmax(
+            attention_weights_float, dim=-1
+        )
+        attention_weights = attention_weights_float.type_as(attention_weights)
+
+        attention_probs = torch.nn.functional.dropout(
+            attention_weights, p=self.dropout, training=self.training
+        )
+
+        # [T, key_length, B, n_head]+ [key_length, B, n_head, d_head]
+        # -> [T, B, n_head, d_head]
+        attention = torch.bmm(attention_probs, v)
+        if self.use_rpe and rpe is not None and self.rpe_v is not None:
+            r_v = self.rpe_v(rpe)
+            attention_rpe = torch.matmul(
+                attention_probs.transpose(0, 1), r_v
+            ).transpose(0, 1)
+
+            if self.rpe_old_option:
+                attention += attention + attention_rpe  # legacy: 2 * attention + rpe
+            else:
+                attention = attention + attention_rpe
+
+        assert list(attention.shape) == [B * self.num_heads, T, self.head_dim]
+        # Merge the heads back: [B * n_head, T, d_head] -> [T, B, embed_dim].
+        attention = attention.transpose(0, 1).contiguous().view(T, B, self.embed_dim)
+
+        rc_output_memory = self.out_proj(attention)
+        return rc_output_memory
+
+    @torch.jit.unused
+    def forward(
+        self,
+        input: Tensor,
+        lengths: Tensor,
+        mems: Tensor,
+        attention_mask: Tensor,
+        pre_mems: Optional[Tensor] = None,
+        left_context_key: Optional[Tensor] = None,
+        left_context_val: Optional[Tensor] = None,
+        rpe: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        """
+        Training-time forward pass of NoSegAugmentedMemoryMultiheadAttentionBmm.
+
+        args:
+            input: concatenation along dim 0 of
+                [right_context_0, right_context_1, ..., seg_0, seg_1,
+                ..., summary_0, summary_1, ...]
+            lengths: the length of query which is [seg_0, seg_1, ....]
+            mems: [mem_0, mem_1, ...].
+            attention_mask: attention mask for query = [right_context, query, summary]
+                key = [mem, right_context, query]. Entries that are True are
+                filled with negative_inf. This is only used for training.
+            pre_mems: optional memories prepended to mems when memory is in use
+            left_context_key: optional cached keys inserted before the utterance keys
+            left_context_val: optional cached values inserted before the utterance
+                values
+            rpe: optional relative position input handed to the attention weight
+                and attention output computation
+
+        returns:
+            (rc_output, next_m, next_k, next_v). next_k and next_v are None when
+            no left context cache is given.
+        """
+        summary_length = 0
+        if self.use_mem:
+            # one summary query per incoming memory, plus one (taken before
+            # pre_mems is prepended)
+            summary_length = mems.size(0) + 1
+            if pre_mems is not None:
+                mems = torch.cat([pre_mems, mems], dim=0)
+
+        # In training, lc_length = 0
+        lc_length = 0 if left_context_key is None else left_context_key.size(0)
+
+        result_qkv, input_shape, result_lengths_info, padding_mask = self.prepare_qkv(
+            input=input,
+            mems=mems,
+            lengths=lengths,
+            summary_length=summary_length,
+            lc_length=lc_length,
+        )
+        q, k, v = result_qkv
+        (
+            mem_length,
+            utterance_length,
+            right_context_blocks_length,
+            key_length,
+        ) = result_lengths_info
+
+        next_k = None
+        next_v = None
+        if left_context_key is None:
+            new_k, new_v = k, v
+        else:
+            # splice the cached left context between the [mem, right context]
+            # prefix and the utterance part of the keys and values
+            prefix_length = mem_length + right_context_blocks_length
+            new_k = torch.cat(
+                [
+                    k[:prefix_length, :, :],
+                    left_context_key,
+                    k[-utterance_length:, :, :],
+                ],
+                dim=0,
+            )
+            new_v = torch.cat(
+                [
+                    v[:prefix_length, :, :],
+                    left_context_val,
+                    v[-utterance_length:, :, :],
+                ],
+                dim=0,
+            )
+            next_k = new_k[prefix_length:, :, :]
+            next_v = new_v[prefix_length:, :, :]
+
+        attention_weights, attention_weights_float, v = self.prepare_attention_weights(
+            q=q,
+            new_k=new_k,
+            new_v=new_v,
+            input_shape=input_shape,
+            rpe=rpe,
+        )
+
+        # mask attention
+        attention_weights_float = attention_weights_float.masked_fill(
+            attention_mask.unsqueeze(0), float(self.negative_inf)
+        )
+
+        rc_output_memory = self.prepare_attention_output(
+            attention_weights=attention_weights,
+            attention_weights_float=attention_weights_float,
+            v=v,
+            input_shape=input_shape,
+            key_length=key_length,
+            padding_mask=padding_mask,
+            rpe=rpe,
+        )
+
+        if not self.use_mem:
+            # without memory the incoming mems are handed back unchanged
+            return rc_output_memory, mems, next_k, next_v
+
+        # next_m length equals to summary length - 1
+        # last memory is ignored (unless mini-batches are used)
+        summary_end = None if self.mini_batches else -1
+        next_m = self.squash_mem(rc_output_memory[-summary_length:summary_end])
+        if not self.nonlinear_squash_mem:
+            next_m = torch.clamp(next_m, min=-10, max=10)
+
+        # rc and output
+        rc_output = rc_output_memory[:-summary_length]
+        return rc_output, next_m, next_k, next_v
+
+    @torch.jit.export
+    def forward_jit(
+        self,
+        input: Tensor,
+        lengths: Tensor,
+        mems: Tensor,
+        left_context_key: Tensor,
+        left_context_val: Tensor,
+        rpe: Optional[Tensor],
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        """
+        forward function for NoSegAugmentedMemoryMultiheadAttentionBmm in decoding.
+
+        args:
+            input: formed in the following way
+                [right_context_0, right_context_1, ..., seg_0, seg_1,
+                ..., summary_0, summary_1,..]
+            lengths: the length of query which is [seg_0, seg_1, ....]
+            mems: [mem_0, mem_1, ...].
+            left_context_key: left_context for key part. This is only used for online
+                decoding. In training, this is empty tensor
+            left_context_val: left_context for value part. This is only used for online
+                decoding. In training, this is empty tensor
+            rpe: optional relative position input handed to the attention weight
+                and attention output computation
+
+        returns:
+            (rc_output, next_m, next_k, next_v): the attention output without the
+            summary row, the new memory (the incoming mems when memory is not
+            used), and the key / value rows that follow the
+            [memory, right context] prefix.
+        """
+        # number of cached left-context rows
+        lc_length = left_context_key.size(0)
+
+        # In decoding, summary_length = 1 or 0
+        if self.use_mem:
+            summary_length = 1
+        else:
+            summary_length = 0
+
+        results = self.prepare_qkv(
+            input=input,
+            mems=mems,
+            lengths=lengths,
+            summary_length=summary_length,
+            lc_length=lc_length,
+        )
+        result_qkv, input_shape, result_lengths_info, padding_mask = results
+        q, k, v = result_qkv
+        (
+            mem_length,
+            utterance_length,
+            right_context_blocks_length,
+            key_length,
+        ) = result_lengths_info
+
+        # add the cache key and value: the cached left context is inserted between
+        # the [memory, right context] prefix and the utterance rows
+        new_k = torch.cat(
+            [
+                k[: mem_length + right_context_blocks_length, :, :],
+                left_context_key,
+                k[-utterance_length:, :, :],
+            ],
+            dim=0,
+        )
+        new_v = torch.cat(
+            [
+                v[: mem_length + right_context_blocks_length, :, :],
+                left_context_val,
+                v[-utterance_length:, :, :],
+            ],
+            dim=0,
+        )
+        # everything after the [memory, right context] prefix is returned as the
+        # key / value output
+        next_k = new_k[mem_length + right_context_blocks_length :, :, :]
+        next_v = new_v[mem_length + right_context_blocks_length :, :, :]
+
+        attention_weights, attention_weights_float, v = self.prepare_attention_weights(
+            q=q,
+            new_k=new_k,
+            new_v=new_v,
+            input_shape=input_shape,
+            rpe=rpe,
+        )
+        # In online decoding, we don't have attention mask. But we still need
+        # to disable the attention from summary query to memory
+        # NOTE(review): this runs even when use_mem is False; presumably
+        # mem_length is 0 in that case so nothing is masked - confirm in prepare_qkv.
+        attention_weights_float[:, -1, :mem_length] = float(self.negative_inf)
+        rc_output_memory = self.prepare_attention_output(
+            attention_weights=attention_weights,
+            attention_weights_float=attention_weights_float,
+            v=v,
+            input_shape=input_shape,
+            key_length=key_length,
+            padding_mask=padding_mask,
+            rpe=rpe,
+        )
+
+        # In decoding, summary length is 1
+        if self.use_mem:
+            # the last output row belongs to the summary query and becomes the
+            # new memory
+            next_m = rc_output_memory[-1:]
+            next_m = self.squash_mem(next_m)
+            # rc and output
+            rc_output = rc_output_memory[:-1]
+            if not self.nonlinear_squash_mem:
+                next_m = torch.clamp(next_m, min=-10, max=10)
+        else:
+            rc_output = rc_output_memory
+            # empty tensor as input mems
+            next_m = mems
+
+        return rc_output, next_m, next_k, next_v
+
+    def quantize_(self, params=None):
+        """
+        Dynamically quantize the torch.nn.Linear submodules of this module in place.
+
+        args:
+            params: optional mapping. A truthy "per_channel" entry selects
+                per_channel_dynamic_qconfig; otherwise default_dynamic_qconfig
+                is used.
+
+        returns:
+            self
+        """
+        use_per_channel = (
+            bool(params) and "per_channel" in params and bool(params["per_channel"])
+        )
+        qconfig = (
+            per_channel_dynamic_qconfig if use_per_channel else default_dynamic_qconfig
+        )
+        quantization.quantize_dynamic(
+            self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
+        )
+        return self
+
+
+class NoSegAugmentedMemoryTransformer(nn.Module):
+    """
+    Whole utterance augmented memory transformer.
+
+    This is not pyspeech nn layer. It is used as a module in a master layer where
+    multiple transformers are used.
+
+    One layer applies, in order: a layer norm on [right context, input], the
+    augmented memory attention, dropout plus the residual connection, the
+    position-wise feed-forward block and a final layer norm.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        num_heads,
+        ffn_dim,
+        dropout_in_attn=0.0,
+        dropout_on_attn=None,
+        dropout_on_fc1=None,
+        dropout_on_fc2=None,
+        activation_fn="relu",
+        tanh_on_mem=False,
+        std_scale=None,
+        scaled_init=False,
+        segment_size=128,
+        use_mem=True,
+        mini_batches=False,
+        negative_inf="-inf",
+        layer_index=-1,
+        summarization_method="mean",
+        max_relative_position=0,
+        rpe_old_option=True,
+    ):
+        """
+        args:
+            input_dim: embedding dimension of the layer
+            num_heads: number of attention heads
+            ffn_dim: hidden dimension handed to PositionwiseFF
+            dropout_in_attn: dropout used inside the attention module
+            dropout_on_attn: dropout applied to the attention output; None is
+                treated as 0.0 (no dropout)
+            dropout_on_fc1, dropout_on_fc2: dropouts handed to PositionwiseFF
+            activation_fn: activation handed to PositionwiseFF
+            tanh_on_mem, std_scale, scaled_init, use_mem, mini_batches,
+            negative_inf, layer_index, max_relative_position: handed to the
+                attention module unchanged
+            segment_size, summarization_method: configure the SummarizationLayer
+                that produces the summary queries
+            rpe_old_option: accepted for interface compatibility, see the note
+                in the body
+        """
+        super(NoSegAugmentedMemoryTransformer, self).__init__()
+
+        # NOTE(review): rpe_old_option is accepted here but not forwarded to the
+        # attention module, so the attention keeps its own default. Forwarding it
+        # would change the behaviour of existing configurations - confirm before
+        # changing.
+        self.attention = NoSegAugmentedMemoryMultiheadAttentionBmm(
+            input_dim=input_dim,
+            num_heads=num_heads,
+            dropout=dropout_in_attn,
+            scaled_init=scaled_init,
+            tanh_on_mem=tanh_on_mem,
+            std_scale=std_scale,
+            use_mem=use_mem,
+            mini_batches=mini_batches,
+            negative_inf=negative_inf,
+            layer_index=layer_index,
+            max_relative_position=max_relative_position,
+        )
+        # nn.Dropout compares p against 0 and 1 and therefore raises a TypeError
+        # for the default None; None means "no dropout" here.
+        if dropout_on_attn is None:
+            dropout_on_attn = 0.0
+        self.dropout = nn.Dropout(dropout_on_attn)
+        # NOTE(review): dropout_on_fc1 / dropout_on_fc2 are passed through as
+        # given (possibly None); whether PositionwiseFF accepts None is not
+        # visible from here.
+        self.pos_ff = PositionwiseFF(
+            input_dim=input_dim,
+            ffn_dim=ffn_dim,
+            dropout_on_fc1=dropout_on_fc1,
+            dropout_on_fc2=dropout_on_fc2,
+            activation_fn=activation_fn,
+        )
+        self.layer_norm_pre = Fp32LayerNorm(input_dim)
+        self.layer_norm = Fp32LayerNorm(input_dim)
+        self.segment_size = segment_size
+        self.use_mem = use_mem
+
+        self.memory_op = SummarizationLayer(
+            summarization_method, segment_size, input_dim
+        )
+
+    def set_mini_batches(self, mini_batches):
+        """Set the mini_batches flag on the attention module."""
+        self.attention.mini_batches = mini_batches
+
+    def gen_summary_queries(self, input):
+        """Return the summary queries produced by the summarization layer."""
+        sum_input = self.memory_op(input)
+        return sum_input
+
+    def pre_attention_ops(self, input, right_context_blocks):
+        """
+        Concatenate right context and input and apply the pre-attention layer norm.
+
+        returns:
+            (rc_length, input_length, residual_input, query_input, rc_and_input)
+            where residual_input is the un-normalised concatenation,
+            rc_and_input the normalised one and query_input the normalised rows
+            that belong to input.
+        """
+        rc_length = right_context_blocks.size(0)
+        input_length = input.size(0)
+
+        rc_and_input = torch.cat([right_context_blocks, input], dim=0)
+        residual_input = rc_and_input
+        rc_and_input = self.layer_norm_pre(rc_and_input)
+
+        query_input = rc_and_input[-input_length:, :, :]
+        return rc_length, input_length, residual_input, query_input, rc_and_input
+
+    def after_attention_ops(self, attention_output, residual_input):
+        """Apply dropout, residual add, feed-forward block and the final layer norm."""
+        output = self.dropout(attention_output)
+        output = output + residual_input
+        output = self.pos_ff(output)
+        output = self.layer_norm(output)
+        return output
+
+    @torch.jit.export
+    def forward_jit(
+        self,
+        input: Tensor,
+        lengths: Tensor,
+        mems: Tensor,
+        left_context_key: Tensor,
+        left_context_val: Tensor,
+        right_context_blocks: Tensor,
+        rpe: Optional[Tensor],
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
+        """
+        Decoding-time forward pass.
+
+        returns:
+            (output rows for input, next_m, output rows for the right context,
+            next_k, next_v)
+        """
+
+        results = self.pre_attention_ops(input, right_context_blocks)
+        rc_length, input_length, residual_input, query_input, rc_and_input = results
+
+        # In online decoding, the summary query size is always 1 or 0
+        if self.use_mem:
+            summary_query = self.gen_summary_queries(query_input)
+            summary_query = summary_query[0:1, :, :]
+            rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0)
+        else:
+            rc_qu_su = rc_and_input
+
+        rc_output, next_m, next_k, next_v = self.attention.forward_jit(
+            input=rc_qu_su,
+            lengths=lengths,
+            mems=mems,
+            left_context_key=left_context_key,
+            left_context_val=left_context_val,
+            rpe=rpe,
+        )
+        rc_output = self.after_attention_ops(rc_output, residual_input)
+        results = (
+            rc_output[-input_length:, :, :],
+            next_m,
+            rc_output[0:rc_length, :, :],
+            next_k,
+            next_v,
+        )
+        return results
+
+    @torch.jit.unused
+    def forward(
+        self,
+        input,
+        lengths,
+        mems,
+        right_context_blocks,
+        attention_mask,
+        pre_mems,
+        left_context_key,
+        left_context_val,
+        rpe,
+    ):
+        """
+        Training-time forward pass.
+
+        returns:
+            (output rows for input, next_m, output rows for the right context,
+            next_k, next_v)
+        """
+
+        results = self.pre_attention_ops(input, right_context_blocks)
+        rc_length, input_length, residual_input, query_input, rc_and_input = results
+        if self.use_mem:
+            # all summary queries are appended after [right context, input]
+            summary_query = self.gen_summary_queries(query_input)
+            rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0)
+        else:
+            rc_qu_su = rc_and_input
+
+        rc_output, next_m, next_k, next_v = self.attention(
+            input=rc_qu_su,
+            lengths=lengths,
+            mems=mems,
+            attention_mask=attention_mask,
+            pre_mems=pre_mems,
+            left_context_key=left_context_key,
+            left_context_val=left_context_val,
+            rpe=rpe,
+        )
+
+        # [TODO] Note memory did not go through pos_ff. What happen if we pass
+        # memory through the pos_ff as well?
+        rc_output = self.after_attention_ops(rc_output, residual_input)
+        results = (
+            rc_output[-input_length:, :, :],
+            next_m,
+            rc_output[0:rc_length, :, :],
+            next_k,
+            next_v,
+        )
+
+        return results
+
+
+class NoSegAugmentedMemoryTransformerEncoderLayer(FairseqEncoder):
+    """
+    Whole utterance augmented memory transformer encoder layer. This is a master layer
+    where we can define multiple augmented memory transformers. There are two reasons
+    to setup the master layer.
+    1. We only need to define once about the attention mask. All the layers in the master
+       layer share the same mask.
+    2. pyspeech nn layer has special input and output format. Defining one master layer
+       makes it easier to pass memory between different layers inside the master layer
+
+    args:
+        input_dim: input embedding dimension
+        num_heads: number of heads in multihead self-attention
+        ffn_dim: ffn dimension in FFN layer
+        num_layers: number of augmented memory transformer layers
+        dropout_in_attn: dropout used in multi-head self-attention
+        dropout_on_attn: dropout used for output from the multihead self-attention
+        dropout_on_fc1: dropout used in FFN layer for the first linear layer
+        dropout_on_fc2: dropout used in FFN layer for the second linear layer
+        segment_size: segment size for each segment
+        context_config: (left_context_size, right_context_size) defines the surround context size
+            for each segment
+        max_memory_size: maximum memory size used for each segment
+        scaled_init: whether use scaled init for weight initialization in attention layer
+        std_scale: if std_scale is not None. The weak attention suppression is
+            turned on. For std_scale = 0.5, all the attention smaller than
+            mean + 0.5 * std will be suppressed.
+        activation_fn: activation function used in FFN layer. [ReLU, GELU] supported
+        tanh_on_mem: whether use tanh on memory
+        mini_batches: use mini-batch training
+        negative_inf: the negative infinity value used in attention masking. default is "-inf".
+            For some situation, e.g. LM. it is better to use "-1e8" to avoid nan issue.
+        summarization_method: method to generate segment summarization embedding
+        max_relative_position: max relative position for relative position embedding
+        rpe_old_option: To be compatible with previous model. The previous model
+            was trained with attention += attention + rpe. The correct equation
+            should be attention = attention + rpe
+        [TODO]: remove the rpe_old_option by the end of 2021 Q1.
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        num_heads,
+        ffn_dim,
+        num_layers=1,
+        dropout_in_attn=0.0,
+        dropout_on_attn=0.0,
+        dropout_on_fc1=0.0,
+        dropout_on_fc2=0.0,
+        segment_size=128,
+        context_config=(0, 0),
+        max_memory_size=0,
+        scaled_init=True,
+        std_scale=None,
+        activation_fn="relu",
+        tanh_on_mem=False,
+        mini_batches=False,
+        negative_inf="-inf",
+        deep_init=True,
+        summarization_method="mean",
+        max_relative_position=0,
+        rpe_old_option=True,
+    ):
+        """
+        Validate the configuration and build num_layers augmented memory
+        transformer layers.
+
+        raises:
+            ValueError: if input_dim is not divisible by num_heads, or if
+                max_memory_size is negative.
+        """
+        super().__init__(None)
+        if input_dim % num_heads != 0:
+            raise ValueError(
+                "input_dim ({}) must be divisible by num_heads ({})".format(
+                    input_dim, num_heads
+                )
+            )
+
+        # we used to support growing memory size. However, it will cause
+        # cross stream batching failure. Now we need to have exact max memory size
+        if max_memory_size < 0:
+            raise ValueError("max_memory_size must be >= 0")
+
+        # Only assign right_context. In decoding, left context will be cached.
+        # No need to let the online decoder to re-assign the left context
+        self.left_context, self.right_context = context_config
+        self.segment_size = segment_size
+        self.memory_dim = input_dim
+        self.max_memory_size = max_memory_size
+        self.mini_batches = mini_batches
+        # memory is only used when a non-zero memory size is configured
+        self.use_mem = self.max_memory_size != 0
+
+        self.memory_op = SummarizationLayer(
+            summarization_method, segment_size, input_dim
+        )
+
+        self.layers = torch.nn.ModuleList()
+        self.num_layers = num_layers
+        self.max_relative_position = max_relative_position
+        # relative position embedding is enabled by a positive max distance
+        self.use_rpe = self.max_relative_position > 0
+
+        # arguments shared by every layer; only layer_index differs
+        shared_layer_kwargs = dict(
+            num_heads=num_heads,
+            input_dim=input_dim,
+            ffn_dim=ffn_dim,
+            dropout_in_attn=dropout_in_attn,
+            dropout_on_attn=dropout_on_attn,
+            dropout_on_fc1=dropout_on_fc1,
+            dropout_on_fc2=dropout_on_fc2,
+            segment_size=segment_size,
+            std_scale=std_scale,
+            activation_fn=activation_fn,
+            tanh_on_mem=tanh_on_mem,
+            scaled_init=scaled_init,
+            use_mem=self.use_mem,
+            mini_batches=mini_batches,
+            negative_inf=negative_inf,
+            summarization_method=summarization_method,
+            max_relative_position=max_relative_position,
+            rpe_old_option=rpe_old_option,
+        )
+        for depth in range(self.num_layers):
+            # with deep_init each layer is told its depth, otherwise -1
+            self.layers.append(
+                NoSegAugmentedMemoryTransformer(
+                    layer_index=depth if deep_init else -1,
+                    **shared_layer_kwargs,
+                )
+            )
+
+    def set_mini_batches(self, mini_batches):
+        """
+        Store the mini_batches flag and pass it on to every layer.
+
+        Handy function only used for unit test.
+        """
+        self.mini_batches = mini_batches
+        for transformer_layer in self.layers:
+            transformer_layer.set_mini_batches(mini_batches)
+
+    def _get_relative_position(
+        self,
+        input: Tensor,
+        max_relative_position: int,
+        left_context_length: int,
+        past_length: int,
+        is_decoding: bool,
+    ):
+        """
+        Build the clamped relative position index between every query and key.
+
+        args:
+            input: 3-D tensor; only its first dimension T (which includes the
+                right context) and its device are used
+            max_relative_position: distances are clamped to
+                [-max_relative_position, max_relative_position]
+            left_context_length: number of left-context positions placed before
+                the utterance in the key
+            past_length: number of segments that precede this input; the
+                utterance positions start at past_length * segment_size
+            is_decoding: selects how the memory bank size is computed
+
+        returns:
+            long tensor of shape (num_query, num_key) on input.device with values
+            in [0, 2 * max_relative_position]
+        """
+        # For training, we copy the right context to the start of the utterance
+        # First dimension in distance is corresponding to query.
+        # [right context, utterance, summary vector]
+        # Second dimension in distance is corresponding to key.
+        # [Memory bank, right context, utterance]
+        # For summary vector in query part, the distance with
+        # all other position is 2*max_position. For memory bank in key,
+        # the distance with all other positions is 0.
+
+        T, B, D = input.shape
+        num_segs = math.ceil((T - self.right_context) / self.segment_size)
+
+        # utterance: absolute positions of the frames without the trailing
+        # right context
+        u_st = past_length * self.segment_size
+        u_ed = u_st + T
+        utterance_ranges = torch.arange(u_st, u_ed - self.right_context)
+
+        # left context. Only in minibatch or decoding
+        left_context_ranges = torch.arange(u_st - left_context_length, u_st)
+
+        # Right context block
+        # right context + utterance
+        right_context_blocks = []
+        for i in range(0, num_segs - 1):
+            # right context of segment i starts where segment i + 1 starts
+            st = (i + 1) * self.segment_size + u_st
+            ed = st + self.right_context
+            assert ed < u_ed
+            temp = torch.arange(st, ed)
+            right_context_blocks.append(temp)
+        # the last segment uses the trailing right context of the input
+        right_context_blocks.append(torch.arange(u_ed - self.right_context, u_ed))
+        right_context_ranges = torch.cat(right_context_blocks)
+
+        if self.use_mem:
+            # Memory bank
+            # The position for memory -n, .., -1
+            if is_decoding:
+                memory_size = min(past_length, self.max_memory_size)
+            else:
+                memory_size = num_segs + past_length - 1
+            # descending positions below -max_relative_position
+            memory_bank_ranges = torch.arange(
+                -max_relative_position - 1, -max_relative_position - 1 - memory_size, -1
+            )
+
+            # summary vector
+            # The position for summary vector as the T+max_relative_position+1.
+            # After the clamping, the relative position is max_relative_position
+            summary_pos_st = u_ed + max_relative_position + 1
+            summary_vector_ranges = torch.arange(
+                summary_pos_st, summary_pos_st + num_segs
+            )
+
+            key_ranges = torch.cat(
+                [
+                    memory_bank_ranges,
+                    right_context_ranges,
+                    left_context_ranges,
+                    utterance_ranges,
+                ]
+            )
+
+            query_ranges = torch.cat(
+                [right_context_ranges, utterance_ranges, summary_vector_ranges]
+            )
+        else:
+            key_ranges = torch.cat(
+                [right_context_ranges, left_context_ranges, utterance_ranges]
+            )
+
+            query_ranges = torch.cat([right_context_ranges, utterance_ranges])
+
+        # pairwise key - query distance, clamped and shifted to be non-negative
+        distance = key_ranges[None, :] - query_ranges[:, None]
+        distance_clamp = (
+            torch.clamp(distance, -max_relative_position, max_relative_position)
+            + max_relative_position
+        )
+        distance_clamp = distance_clamp.to(input.device).long().detach()
+        return distance_clamp
+
+    def _get_attention_mask(self, input, past_length=0, left_context_cache=0):
+        """
+        Build the boolean attention mask; True marks positions that are masked out.
+
+        The allowed pattern is assembled from blocks of ones (attend) and zeros
+        (do not attend) and inverted at the end.
+
+        args:
+            input: 3-D tensor; its first dimension is the utterance length, and
+                it is used to allocate the zero blocks
+            past_length: number of memories carried over from the previous
+                sequence
+            left_context_cache: number of cached left-context positions that
+                precede the utterance in the key
+
+        returns:
+            bool tensor with rows [right context blocks, utterance, summary]
+            (summary rows only when memory is used) and columns
+            [memory, right context blocks, left context cache + utterance]
+        """
+        # attention mask for each query contains three parts:
+        # 1. memory part
+        # 2. left_context + segment
+        # 3. right_context_block
+        # so for each segment and its corresponding right context block,
+        # the attention matrix is formed by 9 parts:
+        # [0, m, 0, 0, right_context, 0, 0, seg, 0]
+        # [before memory, memory, after memory, before right context, right_context,
+        #  after right context, before seg, seg, after seg]
+        #
+        # Query is formed in the way as [right_context_blocks, utterance, summary]
+        #
+        # Note: put m and right_context before segment is convenient
+        # for padding_mask operation.
+        # Key lengths = m_length + right_context_block_length + lengths
+        utterance_length, batch_size, _ = input.shape
+        summary_length = math.ceil(utterance_length / self.segment_size)
+        num_segs = summary_length
+        rc_length = self.right_context * num_segs
+        rc = self.right_context
+        lc = self.left_context
+
+        # using mini-batches, there is left context cache available for current
+        # sequence.
+        lcc = left_context_cache
+
+        # max_memory_size is 0 then we don't have memory and summary
+        # past_length is the memory carry from previous sequence
+        if self.use_mem:
+            mem_length = num_segs - 1 + past_length
+        else:
+            mem_length = 0
+        rc_mask = []
+        query_mask = []
+        summary_mask = []
+        for j in range(0, num_segs):
+            # the last segment may be shorter than segment_size
+            ssize = min(self.segment_size, utterance_length - j * self.segment_size)
+
+            rc_size = rc
+            rc_mat = []
+            q_mat = []
+            s_mat = []
+            # index of the first memory visible to segment j, so that at most
+            # max_memory_size memories are attended
+            m_start = max(j + past_length - self.max_memory_size, 0)
+
+            # max_memory_size is 0, then we don't use memory
+            if self.use_mem:
+                # part 0: before memory
+                rc_mat.append(input.new_zeros(rc_size, m_start))
+                q_mat.append(input.new_zeros(ssize, m_start))
+                s_mat.append(input.new_zeros(1, m_start))
+
+                # part 1: memory
+                col_1 = j + past_length - m_start
+                rc_mat.append(torch.ones(rc_size, col_1, device=input.device))
+                q_mat.append(torch.ones(ssize, col_1, device=input.device))
+                # based on D22875746, disable summary query attention
+                # on memory is better for long form utterance
+                s_mat.append(input.new_zeros(1, col_1))
+
+                # part 2: after memory
+                col_2 = mem_length - (j + past_length)
+                rc_mat.append(input.new_zeros(rc_size, col_2))
+                q_mat.append(input.new_zeros(ssize, col_2))
+                s_mat.append(input.new_zeros(1, col_2))
+
+            # part 3: before right context
+            rc_start = j * rc
+            rc_mat.append(input.new_zeros(rc_size, rc_start))
+            q_mat.append(input.new_zeros(ssize, rc_start))
+            s_mat.append(input.new_zeros(1, rc_start))
+
+            # part 4: right context
+            rc_end = rc_start + rc
+            col_4 = rc
+            rc_mat.append(torch.ones(rc_size, col_4, device=input.device))
+            q_mat.append(torch.ones(ssize, col_4, device=input.device))
+            s_mat.append(torch.ones(1, col_4, device=input.device))
+
+            # part 5: after right context
+            col_5 = rc_length - rc_end
+            rc_mat.append(input.new_zeros(rc_size, col_5))
+            q_mat.append(input.new_zeros(ssize, col_5))
+            s_mat.append(input.new_zeros(1, col_5))
+
+            # part 6: before query segment
+            # the visible window starts lc positions before the segment, shifted
+            # by the cached left context
+            seg_start = max(j * self.segment_size + lcc - lc, 0)
+            rc_mat.append(input.new_zeros(rc_size, seg_start))
+            q_mat.append(input.new_zeros(ssize, seg_start))
+            s_mat.append(input.new_zeros(1, seg_start))
+
+            # part 7: query segment
+            # note: right context is put in right context block
+            # here we only need to consider about left context
+            seg_end = min((j + 1) * self.segment_size + lcc, utterance_length + lcc)
+            col_7 = seg_end - seg_start
+            rc_mat.append(torch.ones(rc_size, col_7, device=input.device))
+            q_mat.append(torch.ones(ssize, col_7, device=input.device))
+            s_mat.append(torch.ones(1, col_7, device=input.device))
+
+            # part 8: after query segment
+            col_8 = utterance_length + lcc - seg_end
+            rc_mat.append(input.new_zeros(rc_size, col_8))
+            q_mat.append(input.new_zeros(ssize, col_8))
+            s_mat.append(input.new_zeros(1, col_8))
+
+            # one row block per segment for each of the three query groups
+            rc_mask.append(torch.cat(rc_mat, dim=1))
+            query_mask.append(torch.cat(q_mat, dim=1))
+            summary_mask.append(torch.cat(s_mat, dim=1))
+
+        # no memory, then we don't need summary either
+        # 1 - allowed turns "attend" (1) into False and "blocked" (0) into True
+        if self.use_mem:
+            attention_mask = (
+                1
+                - torch.cat(
+                    [
+                        torch.cat(rc_mask, dim=0),
+                        torch.cat(query_mask, dim=0),
+                        torch.cat(summary_mask, dim=0),
+                    ],
+                    dim=0,
+                )
+            ).to(torch.bool)
+        else:
+            attention_mask = (
+                1
+                - torch.cat(
+                    [torch.cat(rc_mask, dim=0), torch.cat(query_mask, dim=0)], dim=0
+                )
+            ).to(torch.bool)
+
+        return attention_mask
+
+    @torch.jit.export
+    def init_state(
+        self, batch_size: int, device: Optional[Device] = None
+    ) -> List[Tensor]:
+        """
+        Create the all-zero initial state for a batch.
+
+        returns:
+            [memory, left_context_key, left_context_val, past_length] where
+            memory is (num_layers, max_memory_size, batch_size, memory_dim), both
+            left context tensors are (num_layers, left_context, batch_size,
+            memory_dim) and past_length is an int32 tensor of shape
+            (1, batch_size).
+        """
+        memory_shape = [
+            self.num_layers,
+            self.max_memory_size,
+            batch_size,
+            self.memory_dim,
+        ]
+        left_context_shape = [
+            self.num_layers,
+            self.left_context,
+            batch_size,
+            self.memory_dim,
+        ]
+        return [
+            torch.zeros(memory_shape, device=device),
+            torch.zeros(left_context_shape, device=device),
+            torch.zeros(left_context_shape, device=device),
+            torch.zeros([1, batch_size], dtype=torch.int32, device=device),
+        ]
+
+    @torch.jit.export
+    def batch_state(self, states: List[List[Tensor]]) -> List[Tensor]:
+        """
+        Merge per-stream states into one batched state.
+
+        Empty states are skipped. Memory and left context tensors are
+        concatenated along dim 2, past_length along dim 1. An empty input gives
+        an empty list; an input holding only empty states gives four empty
+        tensors.
+        """
+        if len(states) == 0:
+            return []
+
+        memory_parts: List[Tensor] = []
+        key_parts: List[Tensor] = []
+        val_parts: List[Tensor] = []
+        past_length_parts: List[Tensor] = []
+        for state in states:
+            if len(state) != 0:
+                m, lc_key, lc_val, past_length = state
+                memory_parts.append(m)
+                key_parts.append(lc_key)
+                val_parts.append(lc_val)
+                past_length_parts.append(past_length)
+
+        # the four lists grow in lock-step, so checking one of them is enough
+        if len(memory_parts) == 0:
+            return [
+                torch.tensor([]),
+                torch.tensor([]),
+                torch.tensor([]),
+                torch.tensor([]),
+            ]
+
+        return [
+            torch.cat(memory_parts, dim=2),
+            torch.cat(key_parts, dim=2),
+            torch.cat(val_parts, dim=2),
+            torch.cat(past_length_parts, dim=1),
+        ]
+
+    @torch.jit.export
+    def reorder_state(self, state: List[Tensor], indices: Tensor) -> List[Tensor]:
+        """
+        Select batch entries of a state in the order given by indices.
+
+        indices is moved to the device of the memory tensor first. An empty
+        state is returned as an empty list.
+        """
+        if len(state) == 0:
+            return []
+        m, lc_key, lc_val, past_length = state
+        indices = indices.to(device=m.device)
+        # batch is dim 2 for memory / left context and dim 1 for past_length
+        return [
+            m.index_select(2, indices),
+            lc_key.index_select(2, indices),
+            lc_val.index_select(2, indices),
+            past_length.index_select(1, indices),
+        ]
+
+    @torch.jit.export
+    def reset_state(self, state: List[Tensor], indices: Tensor) -> List[Tensor]:
+        """
+        Zero the state of the batch entries selected by indices.
+
+        Mirrors reorder_state: an empty state is returned as an empty list and
+        indices is moved to the device of the memory tensor before it is used.
+        The input tensors are not modified; new tensors are returned.
+        """
+        if len(state) == 0:
+            return []
+        m, lc_key, lc_val, past_length = state
+        # index_fill needs the index on the same device as the tensor it fills
+        indices = indices.to(device=m.device)
+        m = m.index_fill(dim=2, index=indices, value=0.0)
+        lc_key = lc_key.index_fill(dim=2, index=indices, value=0.0)
+        lc_val = lc_val.index_fill(dim=2, index=indices, value=0.0)
+        past_length = past_length.index_fill(dim=1, index=indices, value=0)
+
+        return [m, lc_key, lc_val, past_length]
+
+    @torch.jit.export
+    def state_size(self) -> int:
+        """Return the number of tensors in a state: memory, left context key,
+        left context value and past length."""
+        num_state_tensors = 4
+        return num_state_tensors
+
+    @torch.jit.export
+    def batch_size_in_state(
+        self, state: Optional[List[Tensor]], sloppy: bool = True
+    ) -> Optional[int]:
+        """
+        Return the batch size stored in a state, or None when there is none.
+
+        The batch size is read from dim 2 of the first state tensor. None is
+        returned for a missing state and for an empty state list (an empty list
+        is what batch_state / reorder_state use for "no state"). sloppy is
+        accepted for interface compatibility and is not used.
+        """
+        if state is None:
+            return None
+        if len(state) == 0:
+            # previously this raised IndexError on state[0]
+            return None
+        return state[0].size(2)
+
+    def gen_summary_queries(self, input):
+        """Return the summary queries that the summarization layer produces for input."""
+        return self.memory_op(input)
+
+    def _gen_right_context_padded_input(self, input):  # Concatenate (along dim 0) the right-context frames that follow every segment.
+        # This function deals with input that is already
+        # padded with right context (e.g. minibatch training)
+        right_context_blocks = []
+        T, B, D = input.shape
+        num_segs = math.ceil((T - self.right_context) / self.segment_size)  # T still includes the trailing right context
+        for i in range(0, num_segs - 1):
+            st = (i + 1) * self.segment_size  # right context of segment i starts where segment i ends
+            ed = st + self.right_context
+            assert ed < T
+            temp = input[st:ed, :, :]
+            right_context_blocks.append(temp)
+
+        # the last segment's right context is the tail of the input itself
+        right_context_blocks.append(input[T - self.right_context :, :, :])
+        return torch.cat(right_context_blocks, dim=0)
+
+    def _gen_segs_right_context(self, input, lengths):  # Split `input` into a list of (segment + right context, lengths) pairs.
+        segments = []
+        T, B, D = input.size()
+        nT = T - self.right_context  # frames that are not part of the trailing right context
+
+        # assume input is right context padded
+        num_segs = math.ceil(nT / self.segment_size)
+        # Every segment except the last one is sliced together with the
+        # right_context frames that follow it (clipped at T).
+        for i in range(0, num_segs - 1):
+            st = i * self.segment_size
+            ed = min(T, st + self.segment_size + self.right_context)
+            temp = input[st:ed, :, :]
+            rest_lengths = torch.clamp(
+                lengths - self.segment_size, min=0, max=nT - (i + 1) * self.segment_size
+            )
+            segments.append((temp, lengths - rest_lengths + self.right_context))
+            lengths = rest_lengths  # carry the remaining length into the next segment
+
+        last_seg = input[st + self.segment_size :, :, :]  # NOTE(review): `st` and `rest_lengths` are only bound inside the loop, so num_segs <= 1 fails here
+        segments.append((last_seg, rest_lengths + self.right_context))
+
+        return segments
+
+    @torch.jit.unused
+    def forward(  # Training forward pass over a whole, right-context padded input.
+        self, input: Tensor, padding_masks: Tensor, state: Optional[List[Tensor]] = None
+    ) -> Tuple[Tensor, Tensor, List[Tensor], List[Tensor]]:
+        # Xutai: originally the second argument is lengths.
+        lengths = (~padding_masks).sum(dim=1).long()  # count the non-masked positions along dim 1
+        # mini batch training.
+        if self.mini_batches:
+            return self.forward_mini_batches(input, lengths, state)
+
+        # regular full sequence training. Note, assume the right context is provided
+        # in the input.
+        T, B, D = input.size()
+        right_context_blocks = self._gen_right_context_padded_input(input)
+
+        # generate the relative positional embedding
+        if self.use_rpe:
+            rpe = self._get_relative_position(
+                input=input,
+                max_relative_position=self.max_relative_position,
+                left_context_length=0,
+                past_length=0,
+                is_decoding=False,
+            )
+        else:
+            rpe = None
+        input = input[: T - self.right_context, :, :]  # drop the trailing right context
+
+        attention_mask = self._get_attention_mask(input)
+
+        # first layer uses each segment mean as memory;
+        # the last segment's average is ignored
+        if self.use_mem:
+            mems = self.gen_summary_queries(input)[:-1, :, :]
+        else:
+            mems = torch.zeros(0, input.size(1), input.size(2), device=input.device)  # zero-length memory
+            mems = mems.type_as(input)
+
+        output = input
+        all_outputs = []  # output of every layer, in layer order
+
+        for layer in self.layers:
+            output, mems, right_context_blocks, _, _ = layer(
+                input=output,
+                lengths=lengths,
+                attention_mask=attention_mask,
+                mems=mems,
+                right_context_blocks=right_context_blocks,
+                pre_mems=None,
+                left_context_key=None,
+                left_context_val=None,
+                rpe=rpe,
+            )
+            all_outputs.append(output)
+        return output, padding_masks, [], all_outputs  # state slot is empty; 2nd item is the mask here (forward_mini_batches returns lengths)
+
+    def forward_jit_mini_batch_init(  # Prepare the state and the first-layer memory for one segment / mini batch.
+        self,
+        seg: Tensor,
+        state: Optional[List[Tensor]] = None,
+        is_decoding: bool = False,
+    ):
+        # Prepare state. In whole sequence training, state is ignored.
+        # For minibatch training, we need to prepare state
+        if state is None:
+            state = self.init_state(batch_size=seg.size(1), device=seg.device)
+            if seg.dtype == torch.half:
+                state = [state[0].half(), state[1].half(), state[2].half(), state[3]]  # state[3] (past_length) keeps its dtype
+
+        if self.use_mem:
+            # note input average only on seg, not on right context
+            # first layer uses each segment mean as memory. the last
+            # segment's average is used in state
+            full_mems = self.gen_summary_queries(seg)
+            if is_decoding:
+                mems = full_mems[0:1, :, :]
+                state_mems = torch.cat([state[0][0], mems], dim=0)
+            else:
+                mems = full_mems[:-1, :, :]
+                state_mems = torch.cat([state[0][0], full_mems], dim=0)
+        else:
+            mems = state[0][0]
+            state_mems = mems
+
+        # track processed segment number or memory number
+        # all entries of the same batch share the same past length
+        past_length = state[3][0][0].item()  # read from the first entry only
+        past_left_context = min(past_length * self.segment_size, self.left_context)
+        past_length = min(self.max_memory_size, past_length)
+
+        return state, mems, state_mems, past_length, past_left_context
+
+    def state_update_before(  # Slice the valid tail of memory / left-context key / value for `layer` out of `state`.
+        self, layer: int, state: List[Tensor], past_length: int, past_left_context: int
+    ):
+        pre_mems = state[0][layer][self.max_memory_size - past_length :, :, :]  # assumes max_memory_size rows are stored, so this keeps the last past_length
+        lc_key = state[1][layer][self.left_context - past_left_context :, :, :]  # likewise the last past_left_context rows
+        lc_val = state[2][layer][self.left_context - past_left_context :, :, :]
+        return pre_mems, lc_key, lc_val
+
+    def state_update_after(  # Append this layer's new memory / key / value to the lists that become the next state.
+        self,
+        layer: int,
+        state: List[Tensor],
+        mems: Tensor,
+        next_key: Tensor,
+        next_val: Tensor,
+        mems_list: List[Tensor],
+        lc_key_list: List[Tensor],
+        lc_val_list: List[Tensor],
+    ):
+        # mems is used for next layer
+        if layer < self.num_layers - 1:
+            state_mems = torch.cat([state[0][layer + 1], mems], dim=0)
+            mems_list.append(state_mems[-self.max_memory_size :, :, :])  # keep only the newest max_memory_size rows
+
+        # when mems are passed to the next sequence, we need the last memory. when mems
+        # are used for the next layer, we can ignore the last memory
+        mems = mems[:-1, :, :]
+
+        # note state[1][i] and state[2][i] original length equals to self.left_context
+        new_k = torch.cat([state[1][layer], next_key], dim=0)
+        new_v = torch.cat([state[2][layer], next_val], dim=0)
+        lc_key_list.append(new_k[-self.left_context :, :, :])  # trim back to left_context rows
+        lc_val_list.append(new_v[-self.left_context :, :, :])
+        return mems_list, lc_key_list, lc_val_list, mems
+
+    def state_update_after_loop(  # Write the per-layer lists back into `state` and advance the past-length counter.
+        self,
+        state: List[Tensor],
+        mems_list: List[Tensor],
+        lc_key_list: List[Tensor],
+        lc_val_list: List[Tensor],
+        update_length: int,
+    ):
+        state[0] = torch.stack(mems_list, dim=0)  # NOTE: the passed-in `state` list is modified in place
+        state[1] = torch.stack(lc_key_list, dim=0)
+        state[2] = torch.stack(lc_val_list, dim=0)
+        state[3] = state[3] + update_length
+        return state
+
+    @torch.jit.unused
+    def forward_mini_batches(  # Mini-batch training forward: runs all layers on `input` while reading and updating `state`.
+        self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None
+    ) -> Tuple[Tensor, Tensor, List[Tensor], List[Tensor]]:
+        T, B, D = input.size()
+
+        # input without right context
+        seg = input[: T - self.right_context, :, :]
+
+        # get right context blocks
+        right_context_blocks = self._gen_right_context_padded_input(input)
+
+        mems_list = []
+        lc_key_list = []
+        lc_val_list = []
+        results = self.forward_jit_mini_batch_init(seg, state, False)
+        state, mems, state_mems, past_length, past_left_context = results
+
+        # relative position embedding
+        if self.use_rpe:
+            rpe = self._get_relative_position(
+                input=input,
+                max_relative_position=self.max_relative_position,
+                left_context_length=past_left_context,
+                past_length=past_length,
+                is_decoding=False,
+            )
+        else:
+            rpe = None
+
+        # get attention mask based on seg (not including right context) and available
+        # left context
+        attention_mask = self._get_attention_mask(seg, past_length, past_left_context)
+        mems_list.append(state_mems[-self.max_memory_size :, :, :])  # memory entry for the first layer
+        output = seg
+        i = 0  # index of the current layer
+        all_outputs = []
+        for layer in self.layers:
+            # In order to make cross stream batching work, mem, left context key
+            # and left context value in the state should always be the same shape.
+            # We use the past length to track the processed segment number. In this
+            # way, we take out the essential memory, left context key and left
+            # context val from the state. After finishing the forward for the current
+            # segment we add the new memory, left context key and left context value
+            # into the state and trim the oldest part to keep the shape consistent.
+            pre_mems, lc_key, lc_val = self.state_update_before(
+                i, state, past_length, past_left_context
+            )
+
+            output, mems, right_context_blocks, next_key, next_val = layer.forward(
+                input=output,
+                lengths=lengths,
+                attention_mask=attention_mask,
+                mems=mems,
+                right_context_blocks=right_context_blocks,
+                pre_mems=pre_mems,
+                left_context_key=lc_key,
+                left_context_val=lc_val,
+                rpe=rpe,
+            )
+            all_outputs.append(output)
+            mems_list, lc_key_list, lc_val_list, mems = self.state_update_after(
+                layer=i,
+                state=state,
+                mems=mems,
+                next_key=next_key,
+                next_val=next_val,
+                mems_list=mems_list,
+                lc_key_list=lc_key_list,
+                lc_val_list=lc_val_list,
+            )
+
+            i += 1
+
+        # update state
+        update_length = math.ceil((T - self.right_context) / self.segment_size)  # number of segments just processed
+        state = self.state_update_after_loop(
+            state=state,
+            mems_list=mems_list,
+            lc_key_list=lc_key_list,
+            lc_val_list=lc_val_list,
+            update_length=update_length,
+        )
+
+        return output, lengths, state, all_outputs
+
+    def forward_jit_test(
+        self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None
+    ) -> Tuple[Tensor, Tensor, List[Tensor]]:
+        """
+        Simulate the segment-by-segment forward_jit path over a whole utterance.
+        This is for unit test purposes only; it is not used in training or decoding.
+        Note, extra_right_context is set in the model. In unit test,
+        input = [utterance, right_context], lengths = [utterance_length].
+        args:
+            input: input utterance
+            lengths: utterance input length
+            state: ignored; it is reset to None below (input is a whole utterance)
+        """
+        # [TODO] sequence_to_segment has bug in lengths.
+        seg_src_tokens_lengths = self._gen_segs_right_context(input, lengths)
+
+        seg_enc_tokens_lengths: List[Tuple[Tensor, Tensor]] = []
+        state: Optional[List[Tensor]] = None  # discards whatever the caller passed in
+        for seg_src_tokens, seg_src_lengths in seg_src_tokens_lengths:
+            seg_enc_tokens, seg_enc_lengths, state = self.forward_jit(
+                input=seg_src_tokens, lengths=seg_src_lengths, state=state
+            )
+            seg_enc_tokens_lengths.append((seg_enc_tokens, seg_enc_lengths))
+
+        enc_tokens, enc_lengths = segments_to_sequence(
+            segments=seg_enc_tokens_lengths, time_axis=0
+        )
+
+        state = []  # returns trivial state
+
+        return enc_tokens, enc_lengths, state
+
+    @torch.jit.export
+    def forward_jit(
+        self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None
+    ) -> Tuple[Tensor, Tensor, List[Tensor]]:
+        """
+        Forward helper for online decoding.
+
+        args:
+            input: [seg, right_context]. We assume that online we
+                always pad the right context to the preset right context size.
+                For the last segment, we may have short segment size, but right
+                context size is the same as other segments
+            lengths: utterance input length is the utterance segment length and
+                     right context size
+            state: [memory, left_context_key, left_context_val, past_length]. To improve
+                throughput, in addition to memory, we also cache key and value for
+                left_context in multihead self-attention
+        """
+        # In online decoding, input = [segment, right_context]
+        # Lengths = [segment_length, right_context_length]
+        # so we need to strip the right context in the output
+        T, B, D = input.size()
+        rc_str = T - self.right_context
+        rc_end = T
+        right_context_blocks = input[rc_str:rc_end, :, :]  # the last right_context frames
+        seg = input[:rc_str, :, :]  # everything before the right context
+        lengths = torch.clamp(lengths - self.right_context, min=0)  # lengths of the segment part only
+        mems_list = []
+        lc_key_list = []
+        lc_val_list = []
+
+        results = self.forward_jit_mini_batch_init(seg, state, True)
+        state, mems, state_mems, past_length, past_left_context = results
+
+        # relative position embedding
+        if self.use_rpe:
+            rpe = self._get_relative_position(
+                input=input,
+                max_relative_position=self.max_relative_position,
+                left_context_length=past_left_context,
+                past_length=past_length,
+                is_decoding=True,
+            )
+        else:
+            rpe = None
+
+        # memory for first layer.
+        mems_list.append(state_mems[-self.max_memory_size :, :, :])
+        output = seg
+        i = 0  # index of the current layer
+        for layer in self.layers:
+            # In order to make cross stream batching work, mem, left context key
+            # and left context value in the state should always be the same shape.
+            # We use the past length to track the processed segment number. In this
+            # way, we take out the essential memory, left context key and left
+            # context val from the state. After finishing the forward for the current
+            # segment we add the new memory, left context key and left context value
+            # into the state and trim the oldest part to keep the shape consistent.
+            true_mems, lc_key, lc_val = self.state_update_before(
+                layer=i,
+                state=state,
+                past_length=past_length,
+                past_left_context=past_left_context,
+            )
+
+            output, mems, right_context_blocks, next_key, next_val = layer.forward_jit(
+                input=output,
+                lengths=lengths,
+                mems=true_mems,
+                right_context_blocks=right_context_blocks,
+                left_context_key=lc_key,
+                left_context_val=lc_val,
+                rpe=rpe,
+            )
+            # mems is used for next layer
+            mems_list, lc_key_list, lc_val_list, _ = self.state_update_after(
+                layer=i,
+                state=state,
+                mems_list=mems_list,
+                mems=mems,
+                next_key=next_key,
+                next_val=next_val,
+                lc_key_list=lc_key_list,
+                lc_val_list=lc_val_list,
+            )
+            i += 1
+
+        # update state
+        state = self.state_update_after_loop(
+            state=state,
+            mems_list=mems_list,
+            lc_key_list=lc_key_list,
+            lc_val_list=lc_val_list,
+            update_length=1,
+        )
+
+        return output, lengths, state
+
+    def quantize_(self, params=None):  # Dynamically quantize this module's torch.nn.Linear layers to qint8, in place; returns self.
+        if params and "per_channel" in params and params["per_channel"]:  # truthy params["per_channel"] selects the per-channel qconfig
+            qconfig = per_channel_dynamic_qconfig
+        else:
+            qconfig = default_dynamic_qconfig
+        quantization.quantize_dynamic(
+            self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True
+        )
+        return self
+
+
+# ------------------------------------------------------------------------------
+#   Emformer encoder for seq2seq model
+#   This is a wrapper over the original emformer
+# ------------------------------------------------------------------------------
+def emformer_encoder(klass):  # Return a subclass of `klass` (renamed to klass.__name__) whose transformer layers are one emformer module.
+    class SpeechEncoder(klass):
+        def __init__(self, args):
+            super().__init__(args)
+            stride = SpeechEncoder.conv_layer_stride(args)
+            trf_left_context = args.segment_left_context // stride  # context sizes are divided by the conv stride
+            trf_right_context = args.segment_right_context // stride
+            context_config = [trf_left_context, trf_right_context]
+            self.transformer_layers = nn.ModuleList(  # a single-element list; num_layers is passed to the module itself
+                [
+                    NoSegAugmentedMemoryTransformerEncoderLayer(
+                        input_dim=args.encoder_embed_dim,
+                        num_heads=args.encoder_attention_heads,
+                        ffn_dim=args.encoder_ffn_embed_dim,
+                        num_layers=args.encoder_layers,
+                        dropout_in_attn=args.dropout,
+                        dropout_on_attn=args.dropout,
+                        dropout_on_fc1=args.dropout,
+                        dropout_on_fc2=args.dropout,
+                        activation_fn=args.activation_fn,
+                        context_config=context_config,
+                        segment_size=args.segment_length,
+                        max_memory_size=args.max_memory_size,
+                        scaled_init=True,  # TODO: use constant for now.
+                        tanh_on_mem=args.amtrf_tanh_on_mem,
+                    )
+                ]
+            )
+
+        def forward(self, src_tokens, src_lengths):  # Run the parent forward and repackage its output dict.
+            encoder_out = super().forward(src_tokens, src_lengths)
+            output = encoder_out["encoder_out"][0]
+            encoder_padding_masks = encoder_out["encoder_padding_mask"][0]
+
+            # The original implementation does not count the last segment's
+            # right context in the output, so trim the mask to the output length.
+            encoder_padding_masks = encoder_padding_masks[:, : output.size(0)]
+
+            return {
+                "encoder_out": [output],
+                "encoder_padding_mask": [encoder_padding_masks],
+                "encoder_embedding": [],
+                "encoder_states": [],
+                "src_tokens": [],
+                "src_lengths": [],
+            }
+
+        @staticmethod
+        def conv_layer_stride(args):  # `args` is currently ignored; the stride is fixed
+            # TODO: make it configurable from the args
+            return 4
+
+    SpeechEncoder.__name__ = klass.__name__  # keep the wrapped class's name
+    return SpeechEncoder
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
new file mode 100644
index 0000000000..046421620a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.models import FairseqDecoder, FairseqEncoder
+
+
+# A container for several encoders, one per input modality of the training samples.
+# Only one encoder is selected on each forward call.
+class MultiModalityEncoder(FairseqEncoder):  # Base class: subclasses supply select_encoder(); forward() delegates to the chosen encoder.
+    def __init__(self, dictionary):
+        super().__init__(dictionary)
+
+    def select_encoder(self, mode, **kwargs):  # Must be overridden; expected to return (encoder, kwargs), as forward() unpacks it.
+        raise NotImplementedError("Model must implement the select_encoder method")
+        return None, kwargs  # NOTE(review): unreachable after the raise; only illustrates the expected return shape
+
+    # def post_encoder(self, encoder_out, src_tokens, src_lengths, mode, **kwargs):
+    #    # Default do nothing
+    #    return encoder_out
+
+    # get sample data from JointSpeechTextDataset
+    def forward(self, src_tokens, src_lengths=None, mode="", **kwargs):  # Pick the encoder for `mode` and call it with the (possibly adjusted) kwargs.
+        encoder, kwargs = self.select_encoder(mode, **kwargs)
+        # return self.post_encoder(encoder(src_tokens, src_lengths, **kwargs), src_tokens, src_lengths, mode, **kwargs)
+        return encoder(src_tokens, src_lengths, **kwargs)
+
+
+# A container for several decoders, one per input modality of the training samples.
+# Only one decoder is selected on each forward call.
+class MultiInputDecoder(FairseqDecoder):  # Base class: subclasses supply select_decoder(); forward() delegates to the chosen decoder.
+    def __init__(self, dictionary):
+        super().__init__(dictionary)
+
+    def select_decoder(self, mode, **kwargs):  # Must be overridden; expected to return (decoder, kwargs), as forward() unpacks it.
+        raise NotImplementedError("Model must implement the select_decoder method")
+        return None, kwargs  # NOTE(review): unreachable after the raise; only illustrates the expected return shape
+
+    def forward(  # Pick the decoder for `mode` and call it with the (possibly adjusted) kwargs.
+        self, prev_output_tokens, encoder_out, incremental_state=None, mode="", **kwargs
+    ):
+        decoder, kwargs = self.select_decoder(mode, **kwargs)
+        return decoder(
+            prev_output_tokens,
+            encoder_out,
+            incremental_state=incremental_state,
+            **kwargs
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
new file mode 100644
index 0000000000..79dbbec1b9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
@@ -0,0 +1,234 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from pathlib import Path
+
+import torch
+
+from fairseq import checkpoint_utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import FairseqEncoder, register_model, register_model_architecture
+from fairseq.models.speech_to_text.modules.convolution import (
+    Conv1dSubsampler,
+    Conv2dSubsampler,
+)
+from fairseq.models.speech_to_text.s2t_transformer import (
+    S2TTransformerEncoder,
+    S2TTransformerModel,
+)
+from fairseq.models.speech_to_text.s2t_transformer import (
+    base_architecture as transformer_base_architecture,
+)
+from fairseq.modules import PositionalEmbedding, RelPositionalEncoding
+from fairseq.modules.conformer_layer import ConformerEncoderLayer
+
+logger = logging.getLogger(__name__)
+
+
+class S2TConformerEncoder(FairseqEncoder):
+    """Conformer Encoder for speech translation based on https://arxiv.org/abs/2005.08100"""
+
+    def __init__(self, args):  # Build subsampler, positional encoding, input projection and the conformer layer stack from `args`.
+        super().__init__(None)  # no dictionary is passed to FairseqEncoder
+
+        self.encoder_freezing_updates = args.encoder_freezing_updates
+        self.num_updates = 0  # refreshed through set_num_updates()
+
+        self.embed_scale = math.sqrt(args.encoder_embed_dim)
+        if args.no_scale_embedding:
+            self.embed_scale = 1.0
+        self.padding_idx = 1
+        self.conv_version = args.conv_version
+        if self.conv_version == "s2t_transformer":
+            self.subsample = Conv1dSubsampler(
+                args.input_feat_per_channel * args.input_channels,
+                args.conv_channels,
+                args.encoder_embed_dim,
+                [int(k) for k in args.conv_kernel_sizes.split(",")],
+            )
+        elif self.conv_version == "convtransformer":  # NOTE(review): any other conv_version leaves self.subsample unset, which _forward() needs
+            self.subsample = Conv2dSubsampler(
+                args.input_channels,
+                args.input_feat_per_channel,
+                args.conv_out_channels,
+                args.encoder_embed_dim,
+            )
+        self.pos_enc_type = args.pos_enc_type
+        if self.pos_enc_type == "rel_pos":
+            self.embed_positions = RelPositionalEncoding(
+                args.max_source_positions, args.encoder_embed_dim
+            )
+        elif self.pos_enc_type == "rope":
+            self.embed_positions = None  # no positional module is created here for rope
+        else:  # Use absolute positional embedding
+            self.pos_enc_type = "abs"  # any unrecognised value is normalised to "abs"
+            self.embed_positions = PositionalEmbedding(
+                args.max_source_positions, args.encoder_embed_dim, self.padding_idx
+            )
+
+        self.linear = torch.nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
+        self.dropout = torch.nn.Dropout(args.dropout)
+        self.conformer_layers = torch.nn.ModuleList(
+            [
+                ConformerEncoderLayer(
+                    embed_dim=args.encoder_embed_dim,
+                    ffn_embed_dim=args.encoder_ffn_embed_dim,
+                    attention_heads=args.encoder_attention_heads,
+                    dropout=args.dropout,
+                    depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
+                    attn_type=args.attn_type,
+                    pos_enc_type=self.pos_enc_type,
+                    use_fp16=args.fp16,
+                )
+                for _ in range(args.encoder_layers)
+            ]
+        )
+
+    def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
+        """
+        Args:
+            src_tokens: Input source tokens Tensor of shape B X T X C
+            src_lengths: Lengths Tensor corresponding to input source tokens
+            return_all_hiddens: If true, append every layer's output to encoder_states
+        Returns a dict with:
+            encoder_out: List holding one Tensor of shape T X B X C
+            encoder_padding_mask: List holding the B X T mask; empty if nothing is padded
+            encoder_embedding: Always empty here
+            encoder_states: List of per-layer outputs (T X B X C); empty unless requested
+            src_tokens: Optional Tensor. Always empty here
+            src_lengths: Optional Tensor. Always empty here
+        """
+        x, input_lengths = self.subsample(src_tokens, src_lengths)  # returns T X B X C
+        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
+        x = self.embed_scale * x
+        if self.pos_enc_type == "rel_pos":
+            positions = self.embed_positions(x)  # handed to every layer below
+
+        elif self.pos_enc_type == "rope":
+            positions = None
+
+        else:
+            positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
+            x += positions  # absolute positions are added to the input once
+            positions = None  # ...so the layers receive no positions
+
+        x = self.linear(x)
+        x = self.dropout(x)
+        encoder_states = []
+
+        # x is T X B X C
+        for layer in self.conformer_layers:
+            x, _ = layer(x, encoder_padding_mask, positions)
+            if return_all_hiddens:
+                encoder_states.append(x)
+
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask]
+            if encoder_padding_mask.any()
+            else [],  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    def forward(self, src_tokens, src_lengths, return_all_hiddens=False):  # Run _forward(), without gradients while the encoder is frozen.
+        if self.num_updates < self.encoder_freezing_updates:  # still inside the freezing window
+            with torch.no_grad():
+                x = self._forward(
+                    src_tokens,
+                    src_lengths,
+                    return_all_hiddens=return_all_hiddens,
+                )
+        else:
+            x = self._forward(
+                src_tokens,
+                src_lengths,
+                return_all_hiddens=return_all_hiddens,
+            )
+        return x
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Required method for a FairseqEncoder. Reuses S2TTransformerEncoder's implementation."""
+        return S2TTransformerEncoder.reorder_encoder_out(self, encoder_out, new_order)
+
+    def set_num_updates(self, num_updates):  # Record the update count that forward() compares against encoder_freezing_updates.
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+
+@register_model("s2t_conformer")
+class S2TConformerModel(S2TTransformerModel):  # S2TTransformerModel variant whose encoder is an S2TConformerEncoder.
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):  # Register the parent's arguments plus the conformer-specific ones below.
+        S2TTransformerModel.add_args(parser)
+        parser.add_argument(
+            "--input-feat-per-channel",
+            type=int,
+            metavar="N",
+            help="dimension of input features per channel",
+        )
+        parser.add_argument(
+            "--input-channels",
+            type=int,
+            metavar="N",
+            help="number of chennels of input features",
+        )
+        parser.add_argument(
+            "--depthwise-conv-kernel-size",
+            type=int,
+            metavar="N",
+            help="kernel size of depthwise convolution layers",
+        )
+        parser.add_argument(
+            "--attn-type",
+            type=str,
+            metavar="STR",
+            help="If not specified uses fairseq MHA. Other valid option is espnet",
+        )
+        parser.add_argument(
+            "--pos-enc-type",
+            type=str,
+            metavar="STR",
+            help="Must be specified in addition to attn-type=espnet for rel_pos and rope",
+        )
+
+    @classmethod
+    def build_encoder(cls, args):  # Create the encoder and, if a checkpoint path is configured and exists, load pretrained weights.
+        encoder = S2TConformerEncoder(args)
+        pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
+        if pretraining_path is not None:
+            if not Path(pretraining_path).exists():  # a missing checkpoint only logs a warning
+                logger.warning(
+                    f"skipped pretraining because {pretraining_path} does not exist"
+                )
+            else:
+                encoder = checkpoint_utils.load_pretrained_component_from_model(
+                    component=encoder, checkpoint=pretraining_path
+                )
+                logger.info(f"loaded pretrained encoder from: {pretraining_path}")
+        return encoder
+
+
+@register_model_architecture("s2t_conformer", "s2t_conformer")
+def conformer_base_architecture(args):  # Fill in conformer defaults for any attribute missing from `args`, then apply the transformer defaults.
+    args.attn_type = getattr(args, "attn_type", None)
+    args.pos_enc_type = getattr(args, "pos_enc_type", "abs")
+    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
+    args.input_channels = getattr(args, "input_channels", 1)
+    args.max_source_positions = getattr(args, "max_source_positions", 6000)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.encoder_layers = getattr(args, "encoder_layers", 16)
+    args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31)
+    transformer_base_architecture(args)  # runs last, so it only fills attributes still unset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
new file mode 100644
index 0000000000..50fae2ffa2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
@@ -0,0 +1,552 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_text.hub_interface import S2THubInterface
+from fairseq.models.speech_to_text.modules.convolution import (
+    Conv1dSubsampler,
+    Conv2dSubsampler,
+)
+from fairseq.models.transformer import Embedding, TransformerDecoder
+from fairseq.modules import (
+    FairseqDropout,
+    LayerNorm,
+    PositionalEmbedding,
+    TransformerEncoderLayer,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("s2t_transformer")
+class S2TTransformerModel(FairseqEncoderDecoderModel):
+    """Adapted Transformer model (https://arxiv.org/abs/1706.03762) for
+    speech-to-text tasks. The Transformer encoder/decoder remains the same.
+    A trainable input subsampler is prepended to the Transformer encoder to
+    project inputs into the encoder dimension as well as downsample input
+    sequence for computational efficiency."""
+
+    @classmethod
+    def hub_models(cls):
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t"
+        model_ids = [
+            "s2t_transformer_s-en-asr-librispeech",
+            "s2t_transformer_m-en-asr-librispeech",
+            "s2t_transformer_l-en-asr-librispeech",
+        ]
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        config_yaml="config.yaml",
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            config_yaml=config_yaml,
+            **kwargs,
+        )
+        return S2THubInterface(x["args"], x["task"], x["models"][0])
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # input
+        parser.add_argument(
+            "--conv-kernel-sizes",
+            type=str,
+            metavar="STR",
+            help="kernel sizes of Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv1d (s2t_transformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-out-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv2d (convtransformer) subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-version",
+            type=str,
+            default="s2t_transformer",
+            choices=["s2t_transformer", "convtransformer"],
+            help="version of frontend convolutional layers",
+        )
+        # Transformer
+        parser.add_argument(
+            "--activation-fn",
+            type=str,
+            default="relu",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--activation-dropout",
+            "--relu-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after activation in FFN.",
+        )
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--layernorm-embedding",
+            action="store_true",
+            help="add layernorm to embedding",
+        )
+        parser.add_argument(
+            "--no-scale-embedding",
+            action="store_true",
+            help="if True, dont scale embeddings",
+        )
+        parser.add_argument(
+            "--load-pretrained-encoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take encoder weights from (for initialization)",
+        )
+        parser.add_argument(
+            "--encoder-freezing-updates",
+            type=int,
+            metavar="N",
+            help="freeze encoder for first N updates",
+        )
+
+    @classmethod
+    def build_encoder(cls, args):
+        encoder = S2TTransformerEncoder(args)
+        pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
+        if pretraining_path is not None:
+            if not Path(pretraining_path).exists():
+                logger.warning(
+                    f"skipped pretraining because {pretraining_path} does not exist"
+                )
+            else:
+                encoder = checkpoint_utils.load_pretrained_component_from_model(
+                    component=encoder, checkpoint=pretraining_path
+                )
+                logger.info(f"loaded pretrained encoder from: {pretraining_path}")
+        return encoder
+
+    @classmethod
+    def build_decoder(cls, args, task, embed_tokens):
+        return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        def build_embedding(dictionary, embed_dim):
+            num_embeddings = len(dictionary)
+            padding_idx = dictionary.pad()
+            return Embedding(num_embeddings, embed_dim, padding_idx)
+
+        decoder_embed_tokens = build_embedding(
+            task.target_dictionary, args.decoder_embed_dim
+        )
+        args.tgt_dict_size = len(task.target_dictionary)
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args, task, decoder_embed_tokens)
+        return cls(encoder, decoder)
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        # net_output['encoder_out'] is a (B, T, D) tensor
+        lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+        lprobs.batch_first = True
+        return lprobs
+
+    def get_ctc_target(self, sample: Optional[Dict[str, Tensor]]):
+        return sample["target"], sample["target_lengths"]
+
+    def get_ctc_output(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        sample: Optional[Dict[str, Tensor]],
+    ):
+        encoder_out = net_output[1]["encoder_out"]["encoder_out"][0]
+        logits = self.encoder.ctc_proj(encoder_out)  # T x B x C
+        out = utils.log_softmax(logits.float(), dim=-1)
+        padding_mask = net_output[1]["encoder_out"]["encoder_padding_mask"]
+        lens = out.new_full((out.shape[1],), out.shape[0]).long()
+        if len(padding_mask) > 0:
+            lens -= padding_mask[0].sum(dim=-1)
+        return out, lens
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens):
+        """
+        The forward method inherited from the base class has a **kwargs
+        argument in its input, which is not supported in torchscript. This
+        method overwrites the forward method definition without **kwargs.
+        """
+        encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths)
+        decoder_out = self.decoder(
+            prev_output_tokens=prev_output_tokens, encoder_out=encoder_out
+        )
+        return decoder_out
+
+
+class S2TTransformerEncoder(FairseqEncoder):
+    """Speech-to-text Transformer encoder that consists of input subsampler and
+    Transformer encoder."""
+
+    def __init__(self, args):
+        super().__init__(None)  # base constructor argument is None here (presumably the dictionary slot — TODO confirm)
+
+        self.encoder_freezing_updates = args.encoder_freezing_updates
+        self.num_updates = 0  # refreshed by set_num_updates; compared with encoder_freezing_updates in forward
+
+        self.dropout_module = FairseqDropout(
+            p=args.dropout, module_name=self.__class__.__name__
+        )
+        self.embed_scale = math.sqrt(args.encoder_embed_dim)
+        if args.no_scale_embedding:  # --no-scale-embedding turns the sqrt(dim) scaling off
+            self.embed_scale = 1.0
+        self.padding_idx = 1  # fixed padding index handed to PositionalEmbedding below
+
+        self.conv_version = args.conv_version
+        if self.conv_version == "s2t_transformer":  # Conv1d front end
+            self.subsample = Conv1dSubsampler(
+                args.input_feat_per_channel * args.input_channels,
+                args.conv_channels,
+                args.encoder_embed_dim,
+                [int(k) for k in args.conv_kernel_sizes.split(",")],  # "5,5" -> [5, 5]
+            )
+        elif self.conv_version == "convtransformer":  # Conv2d front end
+            self.subsample = Conv2dSubsampler(
+                args.input_channels,
+                args.input_feat_per_channel,
+                args.conv_out_channels,
+                args.encoder_embed_dim,
+            )
+
+        self.embed_positions = PositionalEmbedding(
+            args.max_source_positions, args.encoder_embed_dim, self.padding_idx
+        )
+
+        self.transformer_layers = nn.ModuleList(
+            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
+        )
+        if args.encoder_normalize_before:  # a final LayerNorm exists only in the pre-norm configuration
+            self.layer_norm = LayerNorm(args.encoder_embed_dim)
+        else:
+            self.layer_norm = None
+
+        self.ctc_proj = None  # stays None unless a positive ctc_weight is configured
+        if getattr(args, "ctc_weight", 0.0) > 0.0:
+            self.ctc_proj = nn.Linear(args.encoder_embed_dim, args.tgt_dict_size)
+
+    def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):  # the actual pass; forward adds freezing
+        x, input_lengths = self.subsample(src_tokens, src_lengths)
+        x = self.embed_scale * x
+
+        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
+        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)  # transposed before being added to x
+        x += positions
+        x = self.dropout_module(x)
+
+        encoder_states = []  # filled only when return_all_hiddens is set
+
+        for layer in self.transformer_layers:
+            x = layer(x, encoder_padding_mask)
+            if return_all_hiddens:
+                encoder_states.append(x)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask]
+            if encoder_padding_mask.any()
+            else [],  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    def forward(self, src_tokens, src_lengths, return_all_hiddens=False):  # same outputs as _forward
+        if self.num_updates < self.encoder_freezing_updates:  # still inside the freezing window
+            with torch.no_grad():  # no gradients are recorded for the encoder while it is frozen
+                x = self._forward(
+                    src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
+                )
+        else:
+            x = self._forward(
+                src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
+            )
+        return x
+
+    def reorder_encoder_out(self, encoder_out, new_order):  # re-index every cached output by new_order
+        new_encoder_out = (
+            []
+            if len(encoder_out["encoder_out"]) == 0
+            else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]  # dim 1 of T x B x C
+        )
+
+        new_encoder_padding_mask = (
+            []
+            if len(encoder_out["encoder_padding_mask"]) == 0
+            else [
+                x.index_select(0, new_order)  # dim 0 of B x T
+                for x in encoder_out["encoder_padding_mask"]
+            ]
+        )
+
+        new_encoder_embedding = (
+            []
+            if len(encoder_out["encoder_embedding"]) == 0
+            else [
+                x.index_select(0, new_order) for x in encoder_out["encoder_embedding"]
+            ]
+        )
+
+        encoder_states = encoder_out["encoder_states"]
+        if len(encoder_states) > 0:
+            for idx, state in enumerate(encoder_states):
+                encoder_states[idx] = state.index_select(1, new_order)  # NOTE: rewrites the caller's list in place
+
+        return {
+            "encoder_out": new_encoder_out,  # T x B x C
+            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
+            "encoder_embedding": new_encoder_embedding,  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],  # B x T
+            "src_lengths": [],  # B x 1
+        }
+
+    def set_num_updates(self, num_updates):  # keeps the local counter used by the freezing check in forward
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+
+class TransformerDecoderScriptable(TransformerDecoder):  # decoder whose extract_features wraps the scriptable variant
+    def extract_features(  # returns (features, extra); extra carries encoder_out or is None
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        # call scriptable method from parent class
+        x, _ = self.extract_features_scriptable(  # the second value returned by the parent call is discarded
+            prev_output_tokens,
+            encoder_out,
+            incremental_state,
+            full_context_alignment,
+            alignment_layer,
+            alignment_heads,
+        )
+        extra = {"encoder_out": encoder_out} if incremental_state is None else None  # None when decoding incrementally
+        return x, extra
+
+
+@register_model_architecture(model_name="s2t_transformer", arch_name="s2t_transformer")
+def base_architecture(args):  # fill in every s2t_transformer default that args does not already carry
+    args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0)  # 0 means forward never freezes
+    # Convolutional subsampler
+    args.input_channels = getattr(args, "input_channels", 1)
+    args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5")  # for Conv1d
+    args.conv_channels = getattr(args, "conv_channels", 1024)  # for Conv1d
+    args.conv_out_channels = getattr(args, "conv_out_channels", 256)  # for Conv2d
+    args.conv_version = getattr(args, "conv_version", "s2t_transformer")
+    # Transformer
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 12)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)  # follows the encoder width
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)  # falls back to the general dropout
+    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)  # falls back to the general dropout
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)  # follows the decoder width
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_s")
+def s2t_transformer_s(args):  # "s" preset: 256-dim encoder with 4 attention heads
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
+    args.dropout = getattr(args, "dropout", 0.1)
+    base_architecture(args)  # everything not set above gets the base default
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_xs")
+def s2t_transformer_xs(args):  # "xs" preset: the "s" preset with fewer layers, a smaller FFN and more dropout
+    args.encoder_layers = getattr(args, "encoder_layers", 6)
+    args.decoder_layers = getattr(args, "decoder_layers", 3)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 4)
+    args.dropout = getattr(args, "dropout", 0.3)
+    s2t_transformer_s(args)  # values set above survive because the "s" preset only fills missing attributes
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_sp")
+def s2t_transformer_sp(args):  # "sp" preset: the "s" preset with a 16-layer encoder by default
+    args.encoder_layers = getattr(args, "encoder_layers", 16)
+    s2t_transformer_s(args)  # must run after encoder_layers is set so the base default of 12 is not used
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_m")
+def s2t_transformer_m(args):  # "m" preset: 512-dim encoder with 8 attention heads
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512 * 4)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.dropout = getattr(args, "dropout", 0.15)
+    base_architecture(args)  # everything not set above gets the base default
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_mp")
+def s2t_transformer_mp(args):  # "mp" preset: the "m" preset with a 16-layer encoder by default
+    args.encoder_layers = getattr(args, "encoder_layers", 16)
+    s2t_transformer_m(args)  # must run after encoder_layers is set so the base default of 12 is not used
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_l")
+def s2t_transformer_l(args):  # "l" preset: 1024-dim encoder with 16 attention heads
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024 * 4)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.dropout = getattr(args, "dropout", 0.2)
+    base_architecture(args)  # everything not set above gets the base default
+
+
+@register_model_architecture("s2t_transformer", "s2t_transformer_lp")
+def s2t_transformer_lp(args):  # "lp" preset: the "l" preset with a 16-layer encoder by default
+    args.encoder_layers = getattr(args, "encoder_layers", 16)
+    s2t_transformer_l(args)  # must run after encoder_layers is set so the base default of 12 is not used
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
new file mode 100644
index 0000000000..ad21aeeb1a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
@@ -0,0 +1,504 @@
+#!/usr/bin/env python3
+
+import math
+
+import torch
+import torch.nn as nn
+
+from fairseq.data.data_utils import compute_mask_indices
+from fairseq.models import FairseqEncoder
+from fairseq.models.wav2vec import ConvFeatureExtractionModel
+from fairseq.modules import GradMultiply, LayerNorm, SamePad, TransformerEncoderLayer
+
+
+#   Transformer encoder with wave input, adapted from the wav2vec 2.0 encoder.
+#       uses wav input
+#       uses a trained position embedding so it is easier to match with text input
+class SpeechWavTransformerEncoder(FairseqEncoder):
+
+    # extra parameters for speech encoder besides those defined in transformermodel
+    @staticmethod
+    def add_args(parser):
+        parser.add_argument(
+            "--dropout-input",
+            type=float,
+            metavar="D",
+            help="dropout to apply to the input (after feat extr)",
+        )
+        parser.add_argument(
+            "--dropout-features",
+            type=float,
+            metavar="D",
+            help="dropout to apply to the unmasked features (after feat extr)",
+        )
+        parser.add_argument(
+            "--speech-extractor-mode",
+            type=str,
+            default="layer_norm",
+            choices=["default", "layer_norm"],
+            help="feature extractor norm",
+        )
+
+        parser.add_argument(
+            "--speech-conv-bias",
+            action="store_true",
+            help="include bias in speech conv encoder",
+        )
+
+        parser.add_argument(
+            "--conv-feature-layers",
+            default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+            help="string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]",
+        )
+
+        parser.add_argument(
+            "--speech-mask-length",
+            type=int,
+            help="repeat the mask indices multiple times",
+        )
+
+        parser.add_argument(
+            "--speech-mask-prob",
+            type=float,
+            help="probability of replacing a token with mask",
+        )
+
+        parser.add_argument(
+            "--speech-mask-selection",
+            type=str,
+            choices=["static", "uniform", "normal", "poisson"],
+            help="how to choose masks",
+        )
+
+        parser.add_argument(
+            "--speech-mask-other",
+            type=float,
+            help="stdev of the mask length in case of 'normal' selection strategy",
+        )
+
+        parser.add_argument(
+            "--speech-no-mask-overlap",
+            action="store_true",
+            help="whether to allow masks to overlap",
+        )
+
+        parser.add_argument(
+            "--speech-mask-min-space",
+            type=int,
+            help="min space between spans (if no overlap is enabled)",
+        )
+
+        parser.add_argument(
+            "--speech-mask-channel-length",
+            type=int,
+            help="repeat the mask indices multiple times",
+        )
+
+        parser.add_argument(
+            "--speech-mask-channel-prob",
+            type=float,
+            help="probability of replacing a token with mask",
+        )
+
+        parser.add_argument(
+            "--speech-mask-channel-selection",
+            type=str,
+            choices=["static", "uniform", "normal", "poisson"],
+            help="how to choose masks",
+        )
+
+        parser.add_argument(
+            "--speech-mask-channel-other",
+            type=float,
+            help="stdev of the mask length in case of 'normal' selection strategy",
+        )
+
+        parser.add_argument(
+            "--speech-no-mask-channel-overlap",
+            action="store_true",
+            help="whether to allow masks to overlap",
+        )
+
+        parser.add_argument(
+            "--no-scale-feature",
+            action="store_true",
+            help="no scale for the calculated features",
+        )
+
+        parser.add_argument(
+            "--speech-mask-channel-min-space",
+            type=int,
+            help="min space between spans (if no overlap is enabled)",
+        )
+
+        parser.add_argument(
+            "--feature-grad-mult",
+            type=float,
+            help="reset feature grad mult in wav2vec 2.0 to this",
+        )
+
+        # positional embeddings
+        parser.add_argument(
+            "--conv-pos",
+            type=int,
+            default=128,
+            help="number of filters for convolutional positional embeddings",
+        )
+
+        parser.add_argument(
+            "--conv-pos-groups",
+            type=int,
+            default=16,
+            help="number of groups for convolutional positional embedding",
+        )
+        # model configures
+        parser.add_argument(
+            "--speech-encoder-layers",
+            type=int,
+            help="number of speech encoder layers",
+        )
+        parser.add_argument(
+            "--text-encoder-layers",
+            type=int,
+            help="number of text encoder layers",
+        )
+
+    def __init__(self, args, alway_mask=False):
+        super().__init__(args)
+        self.args = args
+        self.dropout = args.dropout
+        self.embedding_dim = args.encoder_embed_dim
+        self.feat_scale = math.sqrt(args.encoder_embed_dim)
+        if args.no_scale_feature:
+            self.feat_scale = 1.0
+
+        subsample = ConvFeatureExtractionModel(
+            conv_layers=eval(args.conv_feature_layers),
+            dropout=0.0,
+            mode=args.speech_extractor_mode,  # default, layer_norm
+            conv_bias=args.speech_conv_bias,
+        )
+        self.feature_enc_layers = eval(args.conv_feature_layers)
+        self.subsample = subsample
+        self.feat_proj = (
+            nn.Linear(self.feature_enc_layers[-1][0], self.embedding_dim)
+            if self.feature_enc_layers[-1][0] != self.embedding_dim
+            else None
+        )
+
+        self.feat_layer_norm = LayerNorm(self.feature_enc_layers[-1][0])
+
+        self.embed_positions = nn.Conv1d(
+            self.embedding_dim,
+            self.embedding_dim,
+            kernel_size=args.conv_pos,
+            padding=args.conv_pos // 2,
+            groups=args.conv_pos_groups,
+        )
+        std = math.sqrt(4 / (args.conv_pos * self.embedding_dim))
+        nn.init.normal_(self.embed_positions.weight, mean=0, std=std)
+        nn.init.constant_(self.embed_positions.bias, 0)
+
+        self.embed_positions = nn.utils.weight_norm(
+            self.embed_positions, name="weight", dim=2
+        )
+        self.embed_positions = nn.Sequential(
+            self.embed_positions, SamePad(args.conv_pos), nn.GELU()
+        )
+
+        self.mask_prob = args.speech_mask_prob
+        self.mask_selection = args.speech_mask_selection
+        self.mask_other = args.speech_mask_other
+        self.mask_length = args.speech_mask_length
+        self.no_mask_overlap = args.speech_no_mask_overlap
+        self.mask_min_space = args.speech_mask_min_space
+
+        self.mask_channel_prob = args.speech_mask_channel_prob
+        self.mask_channel_selection = args.speech_mask_channel_selection
+        self.mask_channel_other = args.speech_mask_channel_other
+        self.mask_channel_length = args.speech_mask_channel_length
+        self.no_mask_channel_overlap = args.speech_no_mask_channel_overlap
+        self.mask_channel_min_space = args.speech_mask_channel_min_space
+
+        self.dropout_input = nn.Dropout(args.dropout_input)
+        self.dropout_features = nn.Dropout(args.dropout_features)
+
+        self.feature_grad_mult = args.feature_grad_mult
+
+        self.mask_emb = nn.Parameter(
+            torch.FloatTensor(args.encoder_embed_dim).uniform_()
+        )
+
+        self.layers = nn.ModuleList(
+            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
+        )
+        self.layer_norm = LayerNorm(args.encoder_embed_dim)
+        self.normalize_before = args.encoder_normalize_before
+        self.alway_mask = alway_mask
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            return torch.floor((input_length - kernel_size) / stride + 1)
+
+        for i in range(len(self.feature_enc_layers)):
+            input_lengths = _conv_out_length(
+                input_lengths,
+                self.feature_enc_layers[i][1],
+                self.feature_enc_layers[i][2],
+            )
+
+        return input_lengths.to(torch.long)
+
+    def apply_mask(self, x, padding_mask):  # masks x in place; returns (x, time-mask indices or None)
+        B, T, C = x.shape
+        if self.mask_prob > 0:  # time masking is skipped entirely when the probability is 0
+            mask_indices = compute_mask_indices(
+                (B, T),
+                padding_mask,
+                self.mask_prob,
+                self.mask_length,
+                self.mask_selection,
+                self.mask_other,
+                min_masks=2,
+                no_overlap=self.no_mask_overlap,
+                min_space=self.mask_min_space,
+            )
+            mask_indices = torch.from_numpy(mask_indices).to(x.device)  # helper returns a NumPy array
+            x[mask_indices] = self.mask_emb  # selected (batch, time) positions get the learned mask embedding
+        else:
+            mask_indices = None
+
+        if self.mask_channel_prob > 0:  # channel masking is independent of the time masking above
+            mask_channel_indices = compute_mask_indices(
+                (B, C),
+                None,  # no padding mask is supplied for the channel axis
+                self.mask_channel_prob,
+                self.mask_channel_length,
+                self.mask_channel_selection,
+                self.mask_channel_other,
+                no_overlap=self.no_mask_channel_overlap,
+                min_space=self.mask_channel_min_space,
+            )
+            mask_channel_indices = (
+                torch.from_numpy(mask_channel_indices)
+                .to(x.device)
+                .unsqueeze(1)
+                .expand(-1, T, -1)  # repeat the (B, C) mask across all T steps
+            )
+            x[mask_channel_indices] = 0  # selected channels are zeroed rather than replaced by mask_emb
+
+        return x, mask_indices  # the channel mask is not returned
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        return_all_hiddens=False,
+        padding_mask=None,
+        features_only=True,
+    ):
+        mask = self.training or self.alway_mask
+        if self.feature_grad_mult > 0 and self.training:
+            features = self.subsample(src_tokens)
+            if self.feature_grad_mult != 1.0:
+                features = GradMultiply.apply(features, self.feature_grad_mult)
+        else:
+            with torch.no_grad():
+                features = self.subsample(src_tokens)
+        features = features.transpose(1, 2)
+        features = self.feat_layer_norm(features)
+        if self.feat_proj is not None:
+            features = self.feat_proj(features)
+
+        if padding_mask is not None:
+            input_lengths = (1 - padding_mask.long()).sum(-1)
+        else:
+            input_lengths = src_lengths
+        # apply conv formula to get real output_lengths
+        output_lengths = self._get_feat_extract_output_lengths(input_lengths)
+
+        padding_mask = torch.zeros(
+            features.shape[:2], dtype=features.dtype, device=features.device
+        )
+
+        # these two operations makes sure that all values
+        # before the output lengths indices are attended to
+        padding_mask[
+            (
+                torch.arange(padding_mask.shape[0], device=padding_mask.device),
+                output_lengths - 1,
+            )
+        ] = 1
+        padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
+
+        features = self.feat_scale * features if self.feat_scale != 1.0 else features
+        unmasked_features = features.clone()
+
+        features = self.dropout_input(features)
+        unmasked_features = self.dropout_features(unmasked_features)
+        if mask:
+            x, mask_indices = self.apply_mask(features, padding_mask)
+        else:
+            x = features
+            mask_indices = None
+
+        def cal_transformer_layers(x, encoder_padding_mask, return_all_hiddens=False):
+            # x: B x T x C
+            positions = self.embed_positions(x.transpose(1, 2)).transpose(1, 2)
+            x = x + positions
+            if not self.normalize_before:
+                x = self.layer_norm(x)
+
+            # B x T x C -> T x B x C
+            x = x.transpose(0, 1)
+            encoder_states = []
+            for layer in self.layers:
+                x = layer(x, encoder_padding_mask)
+                if return_all_hiddens:
+                    encoder_states.append(x)
+            if self.normalize_before:
+                x = self.layer_norm(x)
+            return x, encoder_states
+
+        x, encoder_states = cal_transformer_layers(x, padding_mask, return_all_hiddens)
+        if features_only:
+            return {
+                "encoder_out": [x],  # [T x B x C]
+                "encoder_padding_mask": [padding_mask]
+                if padding_mask is not None
+                else [],  # B x T
+                "encoder_embedding": [],  #
+                "encoder_states": encoder_states,  # List[T x B x C]
+                "src_tokens": [],
+                "src_lengths": [],
+                "mask_indices": [mask_indices],
+            }
+
+        x_unmasked = x
+        if self.mask_prob > 0 or self.mask_channel_prob > 0:
+            x_unmasked, _ = cal_transformer_layers(unmasked_features, padding_mask)
+        return {
+            "encoder_out": [x],  # [T x B x C]
+            "encoder_unmasked_out": [x_unmasked],  # [T x B x C]
+            "encoder_padding_mask": [padding_mask]
+            if padding_mask is not None
+            else [],  # B x T
+            "encoder_embedding": [],  #
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+            "mask_indices": [mask_indices] if mask_indices is not None else [],  # B X T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder the batch dimension of `encoder_out` according to `new_order`.
+
+        "encoder_out" and "encoder_states" entries are selected along dim 1,
+        "encoder_padding_mask" and "encoder_embedding" entries along dim 0.
+        Fresh lists are built for every key, so the `encoder_out` argument is
+        left unmodified and can safely be reordered again.
+
+        Returns:
+            dict with the reordered lists; "src_tokens" and "src_lengths" are
+            always empty lists.
+        """
+        new_encoder_out = [
+            x.index_select(1, new_order) for x in encoder_out["encoder_out"]
+        ]
+
+        new_encoder_padding_mask = [
+            x.index_select(0, new_order) for x in encoder_out["encoder_padding_mask"]
+        ]
+
+        new_encoder_embedding = [
+            x.index_select(0, new_order) for x in encoder_out["encoder_embedding"]
+        ]
+
+        # Comprehension (not in-place assignment) keeps the caller's list intact.
+        encoder_states = [
+            x.index_select(1, new_order) for x in encoder_out["encoder_states"]
+        ]
+
+        return {
+            "encoder_out": new_encoder_out,  # T x B x C
+            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
+            "encoder_embedding": new_encoder_embedding,  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],  # B x T
+            "src_lengths": [],  # B x 1
+        }
+
+
+class StackedSpeechWavTransformerEncoder(FairseqEncoder):
+    def __init__(self, speech_enc, text_enc_layers, text_layer_norm):
+        super().__init__(None)
+        self.speech_encoder = speech_enc
+        self.text_encoder_layers = text_enc_layers
+        self.final_layer_norm = text_layer_norm
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths=None,
+        return_all_hiddens=False,
+        padding_mask=None,
+        features_only=True,
+    ):
+
+        out = self.speech_encoder.forward(
+            src_tokens,
+            src_lengths,
+            return_all_hiddens,
+            padding_mask=padding_mask,
+            features_only=features_only,
+        )
+        x = out["encoder_out"][0]
+        encoder_padding_mask = None
+        if len(out["encoder_padding_mask"]) > 0:
+            encoder_padding_mask = out["encoder_padding_mask"][0]
+
+        def cal_text_layers(x, padding_mask, return_all_hiddens=False):
+            encoder_states = []
+            for layer in self.text_encoder_layers:
+                x = layer(x, padding_mask)
+                if return_all_hiddens:
+                    encoder_states.append(x)
+            if self.final_layer_norm is not None:
+                x = self.final_layer_norm(x)
+            return x, encoder_states
+
+        x, encoder_states = cal_text_layers(x, encoder_padding_mask, return_all_hiddens)
+        if features_only:
+            return {
+                "encoder_out": [x],  # T x B x C
+                "encoder_padding_mask": [encoder_padding_mask]
+                if encoder_padding_mask is not None
+                else [],  # B x T
+                "encoder_embedding": [],  # B x T x C
+                "encoder_states": encoder_states,  # List[T x B x C]
+                "src_tokens": [],
+                "src_lengths": [],
+            }
+
+        x_u = out["encoder_unmasked_out"][0]
+        x_u, _ = cal_text_layers(x_u, encoder_padding_mask)
+
+        return {
+            "encoder_out": [x],  # [T x B x C]
+            "encoder_unmasked_out": [x_u],  # [T x B x C]
+            "encoder_padding_mask": [encoder_padding_mask]
+            if encoder_padding_mask is not None
+            else [],  # B x T
+            "encoder_embedding": [],  #
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+            "mask_indices": out["mask_indices"],  # B X T
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        return self.speech_encoder.reorder_encoder_out(encoder_out, new_order)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
new file mode 100644
index 0000000000..33117446a5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
@@ -0,0 +1,562 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+
+import logging
+from collections.abc import Iterable
+from itertools import repeat
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+
+# ------------------------------------------------------------------------------
+#   assert_equal()
+# ------------------------------------------------------------------------------
+
+
+def assert_equal(value1, value2, name1=None, name2=None):
+    """Asserts two values are equal otherwise raise an error."""
+
+    str_name1 = "" if name1 is None else "{} ".format(name1)
+    str_name2 = "" if name2 is None else "{} ".format(name2)
+    if value1 != value2:
+        str_value1 = "{}" if name1 is None else "({})"
+        str_value1 = str_value1.format(value1)
+        str_value2 = "{}" if name2 is None else "({})"
+        str_value2 = str_value2.format(value2)
+        raise ValueError(
+            "Expected {}{} == {}{}".format(str_name1, str_value1, str_name2, str_value2)
+        )
+
+
+def fill_config(config, key, value):
+    if value is not None:
+        if key not in config or config[key] is None:
+            config[key] = value
+        assert_equal(value, config[key], "value", f'config["{key}"]')
+
+
+# ------------------------------------------------------------------------------
+#   check_and_return_expected()
+# ------------------------------------------------------------------------------
+
+
+def check_and_return_expected(value, undefined_value, expected_value, name=None):
+    """
+    Return the expected value while checking if the given value is undefined or
+    equal to the expected value.
+    """
+    if (undefined_value is None and value is None) or (undefined_value == value):
+        return expected_value
+    if value != expected_value:
+        str_name = "" if name is None else "{} ".format(name)
+        str_value = "{}" if name is None else "({})"
+        str_value = str_value.format(value)
+        raise ValueError(
+            "Expected {}{} == {}".format(str_name, str_value, expected_value)
+        )
+    return expected_value
+
+
+# ------------------------------------------------------------------------------
+#   get_time_axis()
+# ------------------------------------------------------------------------------
+
+
+def get_time_axis(layout):
+    """
+    Extract the time axis from the layout, for example for breaking sequence into
+    segments.
+    """
+    if layout in ["TB", "TBD"]:
+        return 0
+    if layout in ["BT", "BTD"]:
+        return 1
+    if layout in ["BCTD"]:
+        return 2
+    raise ValueError("Unsupported layout = {}".format(layout))
+
+
+# ------------------------------------------------------------------------------
+#   get_batch_axis()
+# ------------------------------------------------------------------------------
+
+
+def get_batch_axis(layout):
+    """
+    Extract the batch axis from the layout
+    """
+    if layout in ["TB", "TBD"]:
+        return 1
+    if layout in ["BT", "BTD", "BCTD"]:
+        return 0
+    raise ValueError("Unsupported layout = {}".format(layout))
+
+
+# ------------------------------------------------------------------------------
+#   monotonically_increasing_and_bounded()
+# ------------------------------------------------------------------------------
+
+
+def monotonically_increasing_and_bounded(iterable, min=None, max=None):
+    """
+    Check if the elements in the given iterable are strictly increasing and lie
+    within the optional inclusive [min, max] bounds; any Iterable is accepted.
+    """
+    if not isinstance(iterable, Iterable):
+        raise TypeError(
+            "Expected iterable to be of type Iterable, got ({})".format(
+                iterable.__class__.__name__
+            )
+        )
+    previous = None
+    for index, item in enumerate(iterable):
+        if (min is not None and item < min) or (max is not None and item > max):
+            return False
+        if index > 0 and item <= previous:  # equal neighbours are rejected too
+            return False
+        previous = item
+    return True
+
+
+# ------------------------------------------------------------------------------
+#   to_pair()
+# ------------------------------------------------------------------------------
+
+
+def to_pair(value, name):
+    """
+    Return `value` as a 2-tuple: a non-iterable is repeated twice; an iterable
+    must have exactly 2 elements (else ValueError naming `name`) and is converted.
+    """
+    if isinstance(value, Iterable):
+        if len(value) != 2:
+            msg = "Expected `{}` to have exactly 2 elements, got: ({})"
+            raise ValueError(msg.format(name, value))
+        return tuple(value)
+    return tuple(repeat(value, 2))
+
+
+# ------------------------------------------------------------------------------
+#   infer_conv_output_attrs()
+# ------------------------------------------------------------------------------
+
+
+# TODO(cfyeh): figure out if we can get `output_dim` without calling the module.
+def infer_conv_output_attrs(
+    module, input_channels, input_dim, batch_size=1, max_length=8
+):
+    """Return (output.shape[1], output.shape[-1]) of `module` on a random probe."""
+    probe = torch.randn(batch_size, input_channels, max_length, input_dim)
+    with torch.no_grad():  # shape inference only; skip autograd bookkeeping
+        output = module(probe)
+    # output.shape[1] is reported as the channel count, output.shape[-1] as the dim
+    return output.shape[1], output.shape[-1]
+
+
+# ------------------------------------------------------------------------------
+#   NoOp
+# ------------------------------------------------------------------------------
+
+
+class NoOp(torch.nn.Module):
+    """
+    NoOp simply passes the input as the output.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input: Tensor) -> Tensor:
+        return input
+
+
+# ------------------------------------------------------------------------------
+#   Permute: a torch.nn.Module applies permutation on the input tensor.
+# ------------------------------------------------------------------------------
+
+
+class Permute(torch.nn.Module):
+    def __init__(self, dims):
+        super().__init__()
+        self.dims = dims
+
+    def forward(self, input: Tensor) -> Tensor:
+        return input.permute(self.dims).contiguous()
+
+
+# ------------------------------------------------------------------------------
+#   lengths_to_padding_mask()
+# ------------------------------------------------------------------------------
+
+
+def lengths_to_padding_mask(lengths: Tensor) -> Tensor:
+    """Convert lengths of shape (B, ) to padding mask."""
+    batch_size = lengths.shape[0]
+    max_length = int(torch.max(lengths).item())
+    padding_mask = torch.arange(  # [0, ..., T-1]
+        max_length, device=lengths.device, dtype=lengths.dtype
+    ).expand(batch_size, max_length) >= lengths.unsqueeze(1)
+
+    return padding_mask
+
+
+# ------------------------------------------------------------------------------
+#   lengths_to_attention_mask()
+# ------------------------------------------------------------------------------
+
+
+def lengths_to_attention_mask(
+    lengths: Tensor,
+    left_context: Optional[int] = None,
+    right_context: Optional[int] = None,
+) -> Optional[Tensor]:
+    """
+    Generate a (T, T) bool attention mask, T = max(lengths), whose entry [i, j]
+    is True when j lies outside [i - left_context, i + right_context].
+    A context of None is unlimited; None is returned if both contexts are None.
+    """
+
+    if left_context is None and right_context is None:
+        return None
+
+    max_length = int(torch.max(lengths).item())
+
+    # For example, with `max_length` == 6,
+    # indices = tensor([
+    #     [ 0,  1,  2,  3,  4,  5],
+    #     [-1,  0,  1,  2,  3,  4],
+    #     [-2, -1,  0,  1,  2,  3],
+    #     [-3, -2, -1,  0,  1,  2],
+    #     [-4, -3, -2, -1,  0,  1],
+    #     [-5, -4, -3, -2, -1,  0],
+    # ])
+
+    # In some cases the second torch.arange is created on cpu which causes a
+    # failure. Adding the device option to guard against it.
+    indices = torch.arange(
+        max_length, device=lengths.device, dtype=lengths.dtype
+    ).expand(max_length, max_length) - torch.arange(
+        max_length, device=lengths.device
+    ).view(
+        max_length, -1
+    )
+
+    # For example, with `max_length` == 5,
+    # bool_mask = tensor([
+    #     [True, True, True, True, True],
+    #     [True, True, True, True, True],
+    #     [True, True, True, True, True],
+    #     [True, True, True, True, True],
+    #     [True, True, True, True, True],
+    # ])
+    bool_mask = (
+        torch.tensor([True]).to(device=lengths.device).expand(max_length, max_length)
+    )
+
+    # For example, with `max_length` == 5, left_context == 2
+    # left_mask = tensor([
+    #     [ True,  True, True, True, True],
+    #     [ True,  True, True, True, True],
+    #     [ True,  True, True, True, True],
+    #     [False,  True, True, True, True],
+    #     [False, False, True, True, True],
+    # ])
+    if left_context is not None:
+        left_mask = indices >= -left_context
+        bool_mask = bool_mask & left_mask
+
+    # For example, with `max_length` == 5, right_context == 1
+    # right_mask = tensor([
+    #     [True, True, False, False, False],
+    #     [True, True,  True, False, False],
+    #     [True, True,  True,  True, False],
+    #     [True, True,  True,  True,  True],
+    #     [True, True,  True,  True,  True],
+    # ])
+    if right_context is not None:
+        right_mask = indices <= right_context
+        bool_mask = bool_mask & right_mask
+
+    bool_mask = (~bool_mask).to(device=lengths.device)
+    return bool_mask
+
+
+# ------------------------------------------------------------------------------
+#   infer_output_norm()
+# ------------------------------------------------------------------------------
+
+
+def infer_output_norm(module, output_norm=None):
+    """
+    Infer the output norm (name, module) to apply on top of `module` given the
+    desired `output_norm`; (None, NoOp()) means nothing needs to be added.
+    """
+    if output_norm == module.output_norm():
+        # output_norm already matches module.output_norm().
+        return (None, NoOp())
+
+    if output_norm is None and module.output_norm() is not None:
+        logger = logging.getLogger("infer_output_norm()")
+        logger.warning(
+            "trying to set output_norm ({}) ".format(output_norm)
+            + "but got module.output_norm() ({}), ".format(module.output_norm())
+            + "the combined output_norm() will be ({})".format(module.output_norm())
+        )
+        return (None, NoOp())
+
+    if output_norm == "log_softmax":
+        if module.output_norm() is not None:
+            raise ValueError(
+                "incompatible output_norm ({}) ".format(output_norm)
+                + "and module.output_norm() ({})".format(module.output_norm())
+            )
+        else:
+            return ("log_softmax", torch.nn.LogSoftmax(dim=-1))
+
+    if output_norm == "softmax":
+        if module.output_norm() is not None:
+            raise ValueError(
+                "incompatible output_norm ({}) ".format(output_norm)
+                + "and module.output_norm() ({})".format(module.output_norm())
+            )
+        else:
+            return ("softmax", torch.nn.Softmax(dim=-1))
+
+    raise ValueError(
+        "output_norm ({}) not in ".format(output_norm)
+        + "supported list = [None, softmax, log_softmax]"
+    )
+
+
+# ------------------------------------------------------------------------------
+#   infer_channels_from_layout()
+# ------------------------------------------------------------------------------
+
+
+def infer_channels_from_layout(layout, channels):
+    """Extract the number of channels from the layout."""
+    if layout in ("TBD", "BTD"):
+        if channels is not None and channels != 1:
+            raise ValueError(
+                "Expected channels ({}) to be 1 for layout = {}".format(
+                    channels, layout
+                )
+            )
+        if channels is None:
+            return 1
+    return channels
+
+
+# ------------------------------------------------------------------------------
+#   pad_sequence()
+# ------------------------------------------------------------------------------
+
+
+@torch.jit.export
+def pad_sequence(
+    sequence: Tensor,
+    time_axis: int,
+    extra_left_context: int = 0,
+    extra_right_context: int = 0,
+) -> Tensor:
+    """Pad extra left/right contexts to the sequence."""
+
+    if extra_left_context == 0 and extra_right_context == 0:
+        return sequence
+
+    tensors_to_concat = []
+
+    if extra_left_context:
+        size = (extra_left_context,)
+        fill_value = 0
+        indices = torch.full(
+            size=size,
+            fill_value=fill_value,
+            dtype=torch.long,
+            device=sequence.device,
+        )
+        left_padding = torch.index_select(sequence, time_axis, indices)
+        tensors_to_concat.append(left_padding)
+
+    tensors_to_concat.append(sequence)
+
+    # NOTE(cfyeh): for efficiency reason we pad 0 instead of the last frame for
+    #              extra right contexts.
+    if extra_right_context:
+        size = list(sequence.shape)
+        size[time_axis] = extra_right_context
+        right_padding = torch.zeros(size, dtype=sequence.dtype, device=sequence.device)
+        tensors_to_concat.append(right_padding)
+
+    padded_sequence = torch.cat(tensors_to_concat, dim=time_axis)
+    return padded_sequence
+
+
+# ------------------------------------------------------------------------------
+#   sequence_to_segments()
+# ------------------------------------------------------------------------------
+
+
+@torch.jit.export
+def sequence_to_segments(
+    sequence: Tensor,
+    time_axis: int,
+    lengths: Tensor,
+    segment_size: Optional[int] = None,
+    extra_left_context: int = 0,
+    extra_right_context: int = 0,
+) -> List[Tuple[Tensor, Tensor]]:
+    """Breaks sequence into segments."""
+
+    sequence = pad_sequence(
+        sequence=sequence,
+        time_axis=time_axis,
+        extra_left_context=extra_left_context,
+        extra_right_context=extra_right_context,
+    )
+
+    lengths = lengths + extra_left_context + extra_right_context
+
+    segments: List[Tuple[Tensor, Tensor]] = []
+
+    if segment_size is None:
+        segments.append((sequence, lengths))
+        return segments
+
+    offset = 0
+    end = sequence.shape[time_axis]
+    step = segment_size
+    size = extra_left_context + segment_size + extra_right_context
+
+    while offset + extra_left_context + extra_right_context < end:
+        clamped_size = min(size, end - offset)
+        segment_lengths = torch.clamp(lengths - offset, min=0, max=clamped_size)
+        indices = torch.arange(
+            start=offset,
+            end=(offset + clamped_size),
+            step=1,
+            dtype=torch.long,
+            device=sequence.device,
+        )
+        segment_tensor = torch.index_select(sequence, time_axis, indices)
+        segments.append((segment_tensor, segment_lengths))
+        offset = offset + step
+
+    return segments
+
+
+# ------------------------------------------------------------------------------
+#   segments_to_sequence()
+# ------------------------------------------------------------------------------
+
+
+@torch.jit.export
+def segments_to_sequence(
+    segments: List[Tuple[Tensor, Tensor]], time_axis: int
+) -> Tuple[Tensor, Tensor]:
+    """Concatenate segments into a full sequence."""
+    if len(segments) == 1:
+        return segments[0]
+
+    tensors_to_concat: List[Tensor] = []
+    lengths_to_stack: List[Tensor] = []
+
+    for tensor, lengths in segments:
+        tensors_to_concat.append(tensor)
+        lengths_to_stack.append(lengths)
+
+    sequence = torch.cat(tensors_to_concat, dim=time_axis)
+    lengths = torch.stack(lengths_to_stack, dim=0)
+    lengths = torch.sum(lengths, dim=0)
+
+    return sequence, lengths
+
+
+def lengths_to_encoder_padding_mask(lengths, batch_first: bool = False):
+    """
+    convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor
+
+    Args:
+        lengths: a (B, )-shaped tensor
+        batch_first: whether to return a (B, T) tensor
+
+    Return:
+        encoder_padding_mask: a (max_length, B) binary mask ((B, max_length) if
+        batch_first), where [t, b] = False for t < lengths[b] and True otherwise
+        max_length: maximum length of B sequences (returned second)
+
+    TODO:
+        kernelize this function if benchmarking shows this function is slow
+    """
+    max_lengths = torch.max(lengths).item()
+    bsz = lengths.size(0)
+    encoder_padding_mask = torch.arange(  # a (T, ) tensor with [0, ..., T-1]
+        max_lengths
+    ).to(  # move to the right device
+        lengths.device
+    ).view(  # reshape to (1, T)-shaped tensor
+        1, max_lengths
+    ).expand(  # expand to (B, T)-shaped tensor
+        bsz, -1
+    ) >= lengths.view(  # t >= lengths[b] is padding, so the first pad frame is masked
+        bsz, 1
+    ).expand(
+        -1, max_lengths
+    )
+    if not batch_first:
+        return encoder_padding_mask.t(), max_lengths
+    else:
+        return encoder_padding_mask, max_lengths
+
+
+# ------------------------------------------------------------------------------
+#   attention suppression
+# ------------------------------------------------------------------------------
+
+
+def attention_suppression(attention_weights: Tensor, scale: float):
+    # B, H, qlen, klen -> B, H, qlen, 1
+    attention_prob = torch.nn.functional.softmax(attention_weights.float(), dim=-1)
+    attention_nozeros = attention_prob.to(torch.bool)
+    nozeros_sum = torch.sum(attention_nozeros.to(torch.float), dim=-1, keepdim=True)
+
+    # For very sparse situation, we need get round about 0s
+    key_sum = torch.sum(attention_prob, dim=-1, keepdim=True)
+
+    # nozeros_sum should > 1
+    key_mean = key_sum / (nozeros_sum + 1e-8)
+
+    # std calculation
+    dis = (attention_prob - key_mean) * (attention_prob - key_mean)
+
+    # if attention_prob[i] < threshold, then dis_masked[i] = 0; for all i
+    dis_masked = torch.where(
+        attention_nozeros, dis, attention_prob.new_zeros(attention_prob.size())
+    )
+
+    key_var = torch.sum(dis_masked, dim=-1, keepdim=True)
+    key_var = key_var / (nozeros_sum - 1.0 + 1e-8)
+    key_std = torch.sqrt(key_var)
+    key_thread = key_mean - scale * key_std
+
+    # if attention_prob[i] >= key_thread, then attention_prob[i]
+    # , otherwise "-inf"
+    inf_tensor = attention_prob.new_zeros(attention_prob.size()).detach()
+    inf_tensor[:] = float("-inf")
+    attention_weights_float = torch.where(
+        attention_prob < key_thread,
+        inf_tensor,
+        attention_weights.float(),
+    )
+
+    return attention_weights_float.type_as(attention_weights)
+
+
+def layer_norm_backward_hook(module, grad_input, grad_output, clamp_value):
+    return tuple(torch.clamp(v, min=-clamp_value, max=clamp_value) for v in grad_input)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
new file mode 100644
index 0000000000..7b4b234641
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
@@ -0,0 +1,855 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
+from fairseq.models.speech_to_text.hub_interface import S2THubInterface
+from fairseq.models.transformer import (
+    Embedding,
+    TransformerDecoder,
+    TransformerModelBase,
+)
+from fairseq.models.wav2vec import Wav2VecEncoder
+from fairseq.modules.layer_norm import LayerNorm
+
+logger = logging.getLogger(__name__)
+
+
+def build_embedding(dictionary, embed_dim):
+    num_embeddings = len(dictionary)
+    padding_idx = dictionary.pad()
+    return Embedding(num_embeddings, embed_dim, padding_idx)
+
+
+class Conv1dAdaptor(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        n_layers=3,
+        kernel_size=3,
+        stride=2,
+        layerdrop=0.0,
+        layernorm=False,
+        proj=False,
+    ):
+        super().__init__()
+        self.proj, self.proj_ln = None, None
+        self.post_proj, self.post_proj_ln = None, None
+        if proj:
+            self.proj = nn.Sequential(
+                nn.Linear(in_dim, in_dim * 4), nn.ReLU(), nn.Linear(in_dim * 4, in_dim)
+            )
+            self.proj_ln = LayerNorm(in_dim)
+            self.post_proj = nn.Sequential(
+                nn.Linear(out_dim, out_dim * 4),
+                nn.ReLU(),
+                nn.Linear(out_dim * 4, out_dim),
+            )
+            self.post_proj_ln = LayerNorm(out_dim)
+
+        self.layers = nn.ModuleList(
+            nn.Conv1d(
+                in_dim if i == 0 else out_dim,
+                out_dim * 2,
+                kernel_size,
+                stride=stride,
+                padding=kernel_size // 2,
+            )
+            for i in range(n_layers)
+        )
+        self.stride = stride
+        self.layerdrop = layerdrop
+        self.layernorm = LayerNorm(in_dim) if layernorm else None
+
+    @classmethod
+    def add_args(cls, parser):
+        parser.add_argument("--adaptor-n-layers", type=int)
+        parser.add_argument("--adaptor-kernel-size", type=int)
+        parser.add_argument("--adaptor-stride", type=int)
+        parser.add_argument("--adaptor-layerdrop", type=float)
+        parser.add_argument("--adaptor-layernorm", action="store_true")
+        parser.add_argument("--adaptor-proj", action="store_true")
+
+    def forward(self, x, padding_mask: Optional[torch.Tensor]):
+        if self.layernorm is not None:
+            x = self.layernorm(x)
+
+        if self.proj is not None:
+            x = x + 0.5 * self.proj(x)
+            x = self.proj_ln(x)
+
+        if padding_mask is not None:
+            x = utils.index_put(x, padding_mask.T, 0)
+
+        # T x B x C -> B x C x T
+        x = x.transpose(0, 1).transpose(1, 2)
+        out_lens = None
+        if padding_mask is not None:
+            out_lens = (~padding_mask).sum(1).float()
+
+        for layer in self.layers:
+            layerdrop_prob = np.random.random()
+            if not self.training or (layerdrop_prob > self.layerdrop):
+                x = nn.functional.glu(layer(x), dim=1)
+                if padding_mask is not None:
+                    out_lens = ((out_lens - 1) / self.stride + 1).floor()
+        # B x C x T -> T x B x C
+        x = x.transpose(1, 2).transpose(0, 1)
+
+        if self.post_proj is not None:
+            x = x + 0.5 * self.post_proj(x)
+            x = self.post_proj_ln(x)
+
+        out_padding_mask = None
+        if padding_mask is not None:
+            out_padding_mask = lengths_to_padding_mask(out_lens.long())
+            x = utils.index_put(x, out_padding_mask.T, 0)
+        return x, out_padding_mask
+
+
+def add_wav2vec_asr_args(parser):
+    parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model")
+    parser.add_argument(
+        "--no-pretrained-weights",
+        action="store_true",
+        help="if true, does not load pretrained weights",
+    )
+    parser.add_argument(
+        "--dropout-input",
+        type=float,
+        metavar="D",
+        help="dropout to apply to the input (after feat extr)",
+    )
+    parser.add_argument(
+        "--final-dropout",
+        type=float,
+        metavar="D",
+        help="dropout after transformer and before final projection",
+    )
+    parser.add_argument(
+        "--apply-mask", action="store_true", help="apply masking during fine-tuning"
+    )
+    parser.add_argument(
+        "--dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability inside wav2vec 2.0 model",
+    )
+    parser.add_argument(
+        "--attention-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability for attention weights inside wav2vec 2.0 model",
+    )
+    parser.add_argument(
+        "--activation-dropout",
+        "--relu-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability after activation in FFN inside wav2vec 2.0 model",
+    )
+    parser.add_argument(
+        "--mask-length", type=int, help="repeat the mask indices multiple times"
+    )
+    parser.add_argument(
+        "--mask-prob", type=float, help="probability of replacing a token with mask"
+    )
+    parser.add_argument(
+        "--mask-selection",
+        type=str,
+        choices=["static", "uniform", "normal", "poisson"],
+        help="how to choose masks",
+    )
+    parser.add_argument(
+        "--mask-other",
+        type=float,
+        help="stdev of the mask length in case of 'normal' selection strategy",
+    )
+    parser.add_argument(
+        "--no-mask-overlap",
+        action="store_true",
+        help="whether to allow masks to overlap",
+    )
+    parser.add_argument(
+        "--mask-channel-length", type=int, help="repeat the mask indices multiple times"
+    )
+    parser.add_argument(
+        "--mask-channel-prob",
+        type=float,
+        help="probability of replacing a token with mask",
+    )
+    parser.add_argument(
+        "--mask-channel-selection",
+        type=str,
+        choices=["static", "uniform", "normal", "poisson"],
+        help="how to choose masks",
+    )
+    parser.add_argument(
+        "--mask-channel-other",
+        type=float,
+        help="stdev of the mask length in case of 'normal' selection strategy",
+    )
+    parser.add_argument(
+        "--no-mask-channel-overlap",
+        action="store_true",
+        help="whether to allow masks to overlap",
+    )
+    parser.add_argument(
+        "--freeze-finetune-updates",
+        type=int,
+        metavar="N",
+        help="dont finetune wav2vec for this many updates",
+    )
+    parser.add_argument(
+        "--feature-grad-mult",
+        type=float,
+        metavar="D",
+        help="reset feature grad mult in wav2vec 2.0 to this",
+    )
+    parser.add_argument(
+        "--layerdrop",
+        type=float,
+        metavar="D",
+        help="probability of dropping a layer in wav2vec 2.0",
+    )
+    parser.add_argument(
+        "--max-positions",
+        type=int,
+        metavar="N",
+        help="Max input positions to be used in the conformer encoder in wav2vec 2.0",
+    )
+    parser.add_argument("--encoder-proj", action="store_true")
+    parser.add_argument("--w2v-args", default=None)
+    parser.add_argument(
+        "--remove-weight-norm",
+        action="store_true",
+        help="if set, then the weight-norm (in one pos_conv layer) is removed from the model",
+    )
+    parser.add_argument(
+        "--encoder-embed-dim",
+        type=int,
+        metavar="N",
+        help="encoder embedding dimension to be used when w2v_path is None and no encoder_proj is set",
+    )
+
+
+def need_finetuning(ft_params, param_name):
+    if ft_params == "all":
+        return True
+    ft_params_list = ft_params.split(",")
+    for ft_param in ft_params_list:
+        if ft_param in param_name:
+            return True
+    return False
+
+
+class Wav2VecEncoderWithAdaptor(FairseqEncoder):
+    def build_adaptor(self, args):
+        adaptor = None
+        if args.adaptor_n_layers > 0:
+            adaptor = Conv1dAdaptor(
+                args.decoder_embed_dim,
+                args.decoder_embed_dim,
+                n_layers=args.adaptor_n_layers,
+                kernel_size=args.adaptor_kernel_size,
+                stride=args.adaptor_stride,
+                layerdrop=args.adaptor_layerdrop,
+                layernorm=args.adaptor_layernorm,
+                proj=args.adaptor_proj,
+            )
+        return adaptor
+
+    def __init__(self, args):
+        super().__init__(None)
+        self.w2v_encoder = Wav2VecEncoder(args)
+        self.is_v0_arch = not args.adaptor_proj
+        self.w2v_proj_ln = None
+        if not self.is_v0_arch and self.w2v_encoder.proj is not None:
+            self.w2v_proj_ln = LayerNorm(args.decoder_embed_dim)
+        self.adaptor = self.build_adaptor(args)
+
+        self.num_updates = 0
+        self.freezing_updates = args.w2v_freezing_updates
+        self.finetuning_params = args.finetune_w2v_params
+        for k, p in self.w2v_encoder.w2v_model.named_parameters():
+            p.requires_grad = need_finetuning(self.finetuning_params, k)
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add model-specific arguments to the parser."""
+        add_wav2vec_asr_args(parser)
+        parser.add_argument(
+            "--normalize",
+            action="store_true",
+            help="if set, normalizes input to have 0 mean and unit variance",
+        )
+        parser.add_argument(
+            "--finetune-w2v-params",
+            type=str,
+            metavar="STR",
+            help="comma-separated param strings to finetune.",
+        )
+        parser.add_argument("--w2v-freezing-updates", type=int)
+        parser.add_argument("--load-pretrained-encoder-from", type=str, metavar="STR")
+        Conv1dAdaptor.add_args(parser)
+
+    def set_num_updates(self, num_updates):
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+    def forward(self, src_tokens, src_lengths=None, **kwargs):
+        if (
+            self.freezing_updates is not None
+            and self.num_updates > self.freezing_updates
+        ):
+            for p in self.w2v_encoder.w2v_model.parameters():
+                p.requires_grad = True
+
+        padding_mask = lengths_to_padding_mask(src_lengths)
+        out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
+        x, padding_mask = out["encoder_out"], out["padding_mask"]
+        if self.w2v_proj_ln is not None:
+            x = self.w2v_proj_ln(x)
+
+        if self.adaptor is not None:
+            x, padding_mask = self.adaptor(x, padding_mask)
+
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": []
+            if padding_mask is None
+            else [padding_mask],  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": [],  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        new_encoder_out = (
+            []
+            if len(encoder_out["encoder_out"]) == 0
+            else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
+        )
+
+        new_encoder_padding_mask = (
+            []
+            if len(encoder_out["encoder_padding_mask"]) == 0
+            else [
+                x.index_select(0, new_order)
+                for x in encoder_out["encoder_padding_mask"]
+            ]
+        )
+
+        new_encoder_embedding = (
+            []
+            if len(encoder_out["encoder_embedding"]) == 0
+            else [
+                x.index_select(0, new_order) for x in encoder_out["encoder_embedding"]
+            ]
+        )
+
+        encoder_states = encoder_out["encoder_states"]
+        if len(encoder_states) > 0:
+            for idx, state in enumerate(encoder_states):
+                encoder_states[idx] = state.index_select(1, new_order)
+
+        return {
+            "encoder_out": new_encoder_out,  # T x B x C
+            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
+            "encoder_embedding": new_encoder_embedding,  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],  # B x T
+            "src_lengths": [],  # B x 1
+        }
+
+
+def add_decoder_args(parser):
+    parser.add_argument(
+        "--activation-fn",
+        type=str,
+        default="relu",
+        choices=utils.get_available_activation_fns(),
+        help="activation function to use",
+    )
+    parser.add_argument(
+        "--decoder-dropout", type=float, metavar="D", help="dropout probability"
+    )
+    parser.add_argument(
+        "--decoder-attention-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability for attention weights",
+    )
+    parser.add_argument(
+        "--decoder-activation-dropout",
+        type=float,
+        metavar="D",
+        help="dropout probability after activation in FFN.",
+    )
+    parser.add_argument(
+        "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension"
+    )
+    parser.add_argument(
+        "--decoder-ffn-embed-dim",
+        type=int,
+        metavar="N",
+        help="decoder embedding dimension for FFN",
+    )
+    parser.add_argument(
+        "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+    )
+    parser.add_argument(
+        "--decoder-attention-heads",
+        type=int,
+        metavar="N",
+        help="num decoder attention heads",
+    )
+    parser.add_argument(
+        "--decoder-normalize-before",
+        action="store_true",
+        help="apply layernorm before each decoder block",
+    )
+    parser.add_argument(
+        "--layernorm-embedding", action="store_true", help="add layernorm to embedding"
+    )
+    parser.add_argument(
+        "--decoder-layerdrop",
+        type=float,
+        metavar="D",
+        help="layerdrop probability for decoder",
+    )
+    parser.add_argument(
+        "--decoder-learned-pos",
+        action="store_true",
+        help="learn positional embedding in decoder",
+    )
+    parser.add_argument(
+        "--share-decoder-input-output-embed",
+        action="store_true",
+        help="share decoder input and output embeddings",
+    )
+    parser.add_argument(
+        "--no-scale-embedding",
+        action="store_true",
+        help="if True, dont scale embeddings",
+    )
+    parser.add_argument(
+        "--load-pretrained-decoder-from",
+        type=str,
+        metavar="STR",
+        help="model to take decoder weights from (for initialization)",
+    )
+    parser.add_argument(
+        "--finetune-decoder-params",
+        type=str,
+        metavar="STR",
+        help="comma-separated param strings to finetune.",
+    )
+
+
+def remove_weight_norm_from_model(model):
+    from functools import reduce
+
+    layers_with_wn = []
+    for param_name, _ in model.named_parameters():
+        if param_name.endswith("_g"):
+            # retrieve the module with this param_name
+            module_names = param_name.split(".")[
+                :-1
+            ]  # exclude the actual parameter name
+            wn_module = reduce(getattr, module_names, model)
+            layers_with_wn.append(wn_module)
+    for wn_module in layers_with_wn:
+        torch.nn.utils.remove_weight_norm(wn_module)
+        logger.warning(f"Weight norm removed from module with {wn_module}\n")
+
+
+@register_model("xm_transformer")
+class XMTransformerModel(FairseqEncoderDecoderModel):
+    @classmethod
+    def hub_models(cls):
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t"
+        model_ids = [
+            "xm_transformer_600m-es_en-multi_domain",
+            "xm_transformer_600m-ru_en-multi_domain",
+            "xm_transformer_600m-fr_en-multi_domain",
+            "xm_transformer_600m-en_es-multi_domain",
+            "xm_transformer_600m-en_ru-multi_domain",
+            "xm_transformer_600m-en_fr-multi_domain",
+            "xm_transformer_600m-en_zh-multi_domain",
+            "xm_transformer_600m-en_ar-multi_domain",
+            "xm_transformer_600m-en_tr-multi_domain",
+            "xm_transformer_600m-en_vi-multi_domain",
+            "xm_transformer-21_en-xls_r_300m",
+            "xm_transformer-en_15-xls_r_300m",
+            "xm_transformer-21_en-xls_r_1b",
+            "xm_transformer-en_15-xls_r_1b",
+            "xm_transformer-21_en-xls_r_2b",
+            "xm_transformer-en_15-xls_r_2b",
+            "xm_transformer-22_16-xls_r_2b",
+            "xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
+            "xm_transformer_s2ut_800m-en-es-st_plus_asr",
+            "xm_transformer_s2ut_800m-hk-en-h1_2022",
+            "xm_transformer_s2ut_800m-en-hk-h1_2022",
+        ]
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        config_yaml="config.yaml",
+        task="speech_to_text",
+        generation_args=None,
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            config_yaml=config_yaml,
+            task=task,
+            generation_args=generation_args,
+            **kwargs,
+        )
+        return S2THubInterface(x["args"], x["task"], x["models"][0])
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add model-specific arguments to the parser."""
+        Wav2VecEncoderWithAdaptor.add_args(parser)
+        add_decoder_args(parser)
+        parser.add_argument("--checkpoint-activations", action="store_true")
+        parser.add_argument("--offload-activations", action="store_true")
+        parser.add_argument("--min-params-to-wrap", type=int, metavar="N")
+
+    @classmethod
+    def maybe_load_pretrained(cls, component, checkpoint: Optional[str] = None):
+        if checkpoint is None:
+            return component
+
+        _load = checkpoint_utils.load_pretrained_component_from_model
+        try:
+            return _load(component, checkpoint)
+        except RuntimeError as e:
+            logger.warning(e)
+            return _load(component, checkpoint, strict=False)
+
+    @classmethod
+    def build_encoder(cls, args):
+        _args = copy.deepcopy(args)
+        if not args.adaptor_proj and not args.encoder_proj:  # V0 arch
+            if args.w2v_path:
+                state = checkpoint_utils.load_checkpoint_to_cpu(args.w2v_path)
+                if state.get("cfg") is not None:
+                    encoder_embed_dim = state["cfg"]._content["model"][
+                        "encoder_embed_dim"
+                    ]
+                elif state.get("args") is not None:
+                    encoder_embed_dim = state["args"].encoder_embed_dim
+                else:
+                    raise ValueError(f"Invalid config in {args.w2v_path}")
+                _args.decoder_embed_dim = encoder_embed_dim
+                del state
+            else:
+                _args.decoder_embed_dim = args.encoder_embed_dim
+
+        encoder = Wav2VecEncoderWithAdaptor(_args)
+        encoder = cls.maybe_load_pretrained(
+            encoder, getattr(args, "load_pretrained_encoder_from", None)
+        )
+        if args.remove_weight_norm:
+            # remove the wn for EMA usage
+            logger.warning("Removing weight norm from wav2vec encoder")
+            remove_weight_norm_from_model(encoder)
+
+        return encoder
+
+    @classmethod
+    def get_decoder_args_from_checkpoint(cls, ckpt_args):
+        assert "model" in ckpt_args, "Model args not found in checkpoint cfg!"
+        decoder_args = {}
+        for k, v in ckpt_args["model"].__dict__.items():
+            if "decoder" in k:
+                decoder_args[k] = v
+
+        return decoder_args
+
+    @classmethod
+    def override_decoder_args(cls, cli_args, decoder_args_dict):
+        for k, v in decoder_args_dict.items():
+            if v != getattr(cli_args, k, None):
+                logger.warning(
+                    f"Overriding decoder arg {k}: from {getattr(cli_args, k, None)} to {v}"
+                )
+                setattr(cli_args, k, v)
+
+        return cli_args
+
+    @classmethod
+    def build_decoder(cls, args, task, embed_tokens):
+        _args = copy.deepcopy(args)
+        if args.adaptor_proj or args.encoder_proj:  # not V0 arch
+            _args.encoder_embed_dim = _args.decoder_embed_dim
+        _args.dropout = args.decoder_dropout
+        _args.attention_dropout = args.decoder_attention_dropout
+        _args.activation_dropout = args.decoder_activation_dropout
+        _args.layerdrop = _args.decoder_layerdrop
+
+        decoder = TransformerDecoder(_args, task.target_dictionary, embed_tokens)
+        decoder = cls.maybe_load_pretrained(
+            decoder, getattr(args, "load_pretrained_decoder_from", None)
+        )
+
+        for k, p in decoder.named_parameters():
+            p.requires_grad = need_finetuning(args.finetune_decoder_params, k)
+        return decoder
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+        if getattr(args, "load_pretrained_decoder_from", None) is not None:
+            ckpt = torch.load(getattr(args, "load_pretrained_decoder_from", None))
+            decoder_args_dict = cls.get_decoder_args_from_checkpoint(ckpt["cfg"])
+            args = cls.override_decoder_args(args, decoder_args_dict)
+
+        decoder_embed_tokens = build_embedding(
+            task.target_dictionary, args.decoder_embed_dim
+        )
+
+        encoder = cls.build_encoder(args)
+        decoder = cls.build_decoder(args, task, decoder_embed_tokens)
+        base_model = cls(encoder, decoder)
+
+        # set up multitask decoders
+        base_model.multitask_decoders = {}
+        for i, (task_name, task_obj) in enumerate(task.multitask_tasks.items()):
+            # dummy auxiliary decoder
+            if task_obj.args.get_loss_weight(0) == 0:
+                continue
+
+            task_decoder = cls.build_multitask_decoder(
+                args, task_obj.args, task_obj.target_dictionary, args.decoder_embed_dim
+            )
+
+            setattr(base_model, f"{task_name}_decoder", task_decoder)
+            decoder_model_cls = (
+                FairseqEncoderModel
+                if task_obj.args.decoder_type == "ctc"
+                else FairseqLanguageModel
+            )
+            base_model.multitask_decoders[task_name] = decoder_model_cls(
+                getattr(base_model, f"{task_name}_decoder")
+            )
+        return base_model
+
+    @classmethod
+    def build_multitask_decoder(
+        cls,
+        args,
+        mtl_args,
+        tgt_dict,
+        in_dim,
+        is_first_pass_decoder=False,
+    ):
+        decoder_args = mtl_args.decoder_args
+        decoder_args.encoder_embed_dim = in_dim
+        if mtl_args.decoder_type == "transformer":
+            if is_first_pass_decoder:
+                task_decoder = cls.build_text_decoder(args, tgt_dict)
+            else:
+                from fairseq.models.speech_to_speech import (
+                    base_multitask_text_transformer_decoder_arch,
+                )
+
+                base_multitask_text_transformer_decoder_arch(decoder_args)  # 2L
+                task_decoder = TransformerDecoder(
+                    decoder_args,
+                    tgt_dict,
+                    embed_tokens=TransformerModelBase.build_embedding(
+                        decoder_args,
+                        tgt_dict,
+                        decoder_args.decoder_embed_dim,
+                    ),
+                )
+        elif mtl_args.decoder_type == "ctc":
+            task_decoder = CTCDecoder(
+                dictionary=tgt_dict,
+                in_dim=in_dim,
+            )
+        else:
+            raise NotImplementedError(
+                "currently only support multitask decoder_type 'transformer', 'ctc'"
+            )
+
+        return task_decoder
+
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        return_all_hiddens=False,
+        **kwargs,
+    ):
+        """
+        The forward method inherited from the base class has a **kwargs
+        argument in its input, which is not supported in torchscript. This
+        method overwrites the forward method definition without **kwargs.
+        """
+        encoder_out = self.encoder(
+            src_tokens=src_tokens, src_lengths=src_lengths, **kwargs
+        )
+        decoder_out = self.decoder(
+            prev_output_tokens=prev_output_tokens, encoder_out=encoder_out
+        )
+        if return_all_hiddens:
+            decoder_out[-1]["encoder_states"] = encoder_out["encoder_out"]
+            # NOTE: from the top layer
+            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ]
+        return decoder_out
+
+    def upgrade_state_dict(self, state_dict):
+        for k, _ in state_dict.items():
+            if "adaptor.layers" in state_dict:
+                new = k.replace("adaptor.layers", "adaptor_layers")
+                state_dict[new] = state_dict[k]
+                del state_dict[k]
+
+
+def set_default_w2v_encoder_args(args):
+    args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False)
+    args.dropout_input = getattr(args, "dropout_input", 0)
+    args.final_dropout = getattr(args, "final_dropout", 0)
+    args.apply_mask = getattr(args, "apply_mask", False)
+    args.dropout = getattr(args, "dropout", 0)
+    args.attention_dropout = getattr(args, "attention_dropout", 0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0)
+    args.encoder_proj = getattr(args, "encoder_proj", False)
+    args.remove_weight_norm = getattr(args, "remove_weight_norm", False)
+
+    args.mask_length = getattr(args, "mask_length", 10)
+    args.mask_prob = getattr(args, "mask_prob", 0.5)
+    args.mask_selection = getattr(args, "mask_selection", "static")
+    args.mask_other = getattr(args, "mask_other", 0)
+    args.no_mask_overlap = getattr(args, "no_mask_overlap", False)
+    args.mask_channel_length = getattr(args, "mask_channel_length", 10)
+    args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5)
+    args.mask_channel_before = getattr(args, "mask_channel_before", False)
+    args.mask_channel_selection = getattr(args, "mask_channel_selection", "static")
+    args.mask_channel_other = getattr(args, "mask_channel_other", 0)
+    args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False)
+
+    args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0)
+    args.feature_grad_mult = 0.1
+    args.layerdrop = getattr(args, "layerdrop", 0.0)
+
+    args.normalize = getattr(args, "normalize", False)
+    args.finetune_w2v_params = getattr(args, "finetune_w2v_params", "all")
+    args.w2v_freezing_updates = getattr(args, "w2v_freezing_updates", None)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+
+
+def set_default_adaptor_args(args):
+    args.adaptor_n_layers = getattr(args, "adaptor_n_layers", 3)
+    args.adaptor_kernel_size = getattr(args, "adaptor_kernel_size", 3)
+    args.adaptor_stride = getattr(args, "adaptor_stride", 2)
+    args.adaptor_layerdrop = getattr(args, "adaptor_layerdrop", 0.0)
+    args.adaptor_layernorm = getattr(args, "adaptor_layernorm", False)
+    args.adaptor_proj = getattr(args, "adaptor_proj", False)
+
+
+def set_default_transformer_decoder_args(args):
+    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4 * 1024)
+    args.decoder_layers = getattr(args, "decoder_layers", 12)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.decoder_attention_dropout = getattr(args, "decoder_attention_dropout", 0.0)
+    args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0.0)
+    args.decoder_dropout = getattr(args, "decoder_dropout", 0.1)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
+
+    args.activation_fn = getattr(args, "activation_fn", "gelu")
+    args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
+    args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
+
+    args.finetune_decoder_params = getattr(args, "finetune_decoder_params", "all")
+
+
+def set_default_general_args(args):
+    args.checkpoint_activations = getattr(args, "checkpoint_activations", False)
+    args.offload_activations = getattr(args, "offload_activations", False)
+    args.min_params_to_wrap = getattr(args, "min_params_to_wrap", int(1e8))
+    args.max_positions = getattr(args, "max_positions", 3000)
+
+
+@register_model_architecture(model_name="xm_transformer", arch_name="xm_transformer")
+def base_architecture(args):
+    set_default_general_args(args)
+    set_default_w2v_encoder_args(args)
+    set_default_adaptor_args(args)
+    set_default_transformer_decoder_args(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
new file mode 100644
index 0000000000..f77ef4e570
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
@@ -0,0 +1,315 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+
+from fairseq.models import (
+    FairseqEncoderModel,
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
+from fairseq.models.speech_to_speech.modules.transformer_encoder import (
+    TransformerEncoderNoEmb,
+)
+from fairseq.models.speech_to_text.xm_transformer import XMTransformerModel
+from fairseq.models.speech_to_text.xm_transformer import (
+    base_architecture as xm_t_base_architecture,
+)
+from fairseq.models.speech_to_text.xm_transformer import (
+    build_embedding,
+    need_finetuning,
+    set_default_adaptor_args,
+    set_default_general_args,
+    set_default_transformer_decoder_args,
+    set_default_w2v_encoder_args,
+)
+from fairseq.models.transformer import Linear, TransformerDecoder, TransformerModelBase
+from fairseq.models.transformer.transformer_decoder_aug import AugTransformerDecoder
+
+logger = logging.getLogger(__name__)
+
+
+def unit_transformer_decoder_arch_base(
+    args, decoder_layers=6, decoder_embed_dim=768, decoder_attention_heads=12
+):
+    """Overwrite the layer-count and width options on ``args`` in place.
+
+    The FFN width is set to four times ``decoder_embed_dim``."""
+    args.encoder_layers = args.decoder_layers = decoder_layers
+    args.decoder_attention_heads = decoder_attention_heads
+    args.decoder_embed_dim = args.encoder_embed_dim = decoder_embed_dim
+    args.decoder_output_dim = args.decoder_input_dim = decoder_embed_dim
+    args.decoder_ffn_embed_dim = 4 * decoder_embed_dim
+
+
+def unit_transformer_decoder_arch_large(
+    args, decoder_layers=12, decoder_embed_dim=1024, decoder_attention_heads=16
+):
+    """Overwrite the layer-count and width options on ``args`` in place.
+
+    Same fields as the base preset; only the default values differ."""
+    args.encoder_layers = args.decoder_layers = decoder_layers
+    args.decoder_attention_heads = decoder_attention_heads
+    args.decoder_embed_dim = args.encoder_embed_dim = decoder_embed_dim
+    args.decoder_output_dim = args.decoder_input_dim = decoder_embed_dim
+    args.decoder_ffn_embed_dim = 4 * decoder_embed_dim
+
+
+@register_model("unity_xm_transformer")
+class XMTransformerModelUnitY(XMTransformerModel):  # two-pass: MT decoder, then unit decoder
+    @classmethod
+    def hub_models(cls):
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t"
+        model_ids = []  # empty, so the returned mapping is currently {}
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add the XMTransformerModel arguments plus the options defined below."""
+        XMTransformerModel.add_args(parser)
+        parser.add_argument(
+            "--translation-decoder-layers",
+            type=int,
+            default=4,
+            metavar="N",
+            help="num decoder layers in the first-pass translation module",
+        )
+        parser.add_argument(
+            "--synthesizer-encoder-layers",
+            type=int,
+            default=0,
+            metavar="N",
+            help="num encoder layers in the second-pass synthesizer module",
+        )
+        parser.add_argument(
+            "--synthesizer-augmented-cross-attention",
+            action="store_true",
+            default=False,
+            help="augmented cross-attention over speech encoder output",
+        )
+        parser.add_argument(
+            "--load-pretrained-aux-decoder-from",
+            type=str,
+            metavar="STR",
+            help="model to take decoder weights from (for initialization)",
+        )
+
+    @classmethod
+    def build_text_decoder(cls, args, tgt_dict):
+        _args = copy.deepcopy(args)  # edit a copy so the caller's args stay unchanged
+
+        if args.adaptor_proj or args.encoder_proj:  # not V0 arch
+            _args.encoder_embed_dim = _args.decoder_embed_dim
+        _args.dropout = args.decoder_dropout
+        _args.attention_dropout = args.decoder_attention_dropout
+        _args.activation_dropout = args.decoder_activation_dropout
+        _args.layerdrop = _args.decoder_layerdrop
+        _args.decoder_layers = _args.translation_decoder_layers
+
+        embed_tokens = build_embedding(tgt_dict, _args.decoder_embed_dim)
+        decoder = TransformerDecoder(_args, tgt_dict, embed_tokens)
+
+        if getattr(args, "load_pretrained_aux_decoder_from", None) is not None:
+            decoder = cls.maybe_load_pretrained(
+                decoder, getattr(args, "load_pretrained_aux_decoder_from", None)
+            )
+            # requires_grad is only adjusted when pretrained weights were requested
+            for k, p in decoder.named_parameters():
+                p.requires_grad = need_finetuning(args.finetune_decoder_params, k)
+        return decoder
+
+    @classmethod
+    def build_decoder(cls, args, task, aug_attn=False):  # returns (decoder, proj, _args)
+        _args = copy.deepcopy(args)
+        _args.layerdrop = 0.0  # turn off layerdrop for shallow layers
+
+        _args.encoder_embed_dim = args.decoder_embed_dim
+        # NOTE(review): _args.decoder_embed_dim is copied from args, so proj stays None
+        proj = None
+        if args.decoder_embed_dim != _args.decoder_embed_dim:
+            proj = Linear(args.decoder_embed_dim, _args.decoder_embed_dim)
+
+        embed_tokens = build_embedding(task.target_dictionary, _args.decoder_embed_dim)
+        decoder_cls = AugTransformerDecoder if aug_attn else TransformerDecoder
+        decoder = decoder_cls(_args, task.target_dictionary, embed_tokens)
+
+        if getattr(args, "load_pretrained_decoder_from", None) is not None:
+            # load all layers first and then discard the bottom layers
+            embed_tokens = build_embedding(
+                task.target_dictionary, _args.decoder_embed_dim
+            )
+            decoder_tmp = decoder_cls(_args, task.target_dictionary, embed_tokens)
+            decoder_tmp = cls.maybe_load_pretrained(
+                decoder_tmp, getattr(_args, "load_pretrained_decoder_from", None)
+            )
+            state_dict = decoder_tmp.state_dict()
+            for k, p in decoder.named_parameters():
+                p.data = state_dict[k].data
+                p.requires_grad = need_finetuning(_args.finetune_decoder_params, k)
+            decoder.layers = decoder.layers[-_args.decoder_layers :]
+
+        return decoder, proj, _args
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        xm_t_base_architecture(args)
+
+        encoder = cls.build_encoder(args)
+        decoder, proj, unit_args = cls.build_decoder(
+            args,
+            task,
+            aug_attn=getattr(args, "synthesizer_augmented_cross_attention", False),
+        )
+        base_model = cls(encoder, decoder)
+        setattr(base_model, "proj", proj)
+
+        base_model.t2u_augmented_cross_attn = getattr(
+            args, "synthesizer_augmented_cross_attention", False
+        )
+
+        # set up multitask decoders
+        base_model.mt_task_name = None
+        base_model.multitask_decoders = {}
+        has_first_pass_decoder = False
+        for task_name, task_obj in task.multitask_tasks.items():
+            if task_obj.is_first_pass_decoder:
+                has_first_pass_decoder = True
+                base_model.mt_task_name = task_name
+
+            task_decoder = cls.build_multitask_decoder(
+                args,
+                task_obj.args,
+                task_obj.target_dictionary,
+                args.decoder_embed_dim,
+                task_obj.is_first_pass_decoder,
+            )
+
+            setattr(base_model, f"{task_name}_decoder", task_decoder)
+            decoder_model_cls = (
+                FairseqEncoderModel
+                if task_obj.args.decoder_type == "ctc"
+                else FairseqLanguageModel
+            )
+            base_model.multitask_decoders[task_name] = decoder_model_cls(
+                getattr(base_model, f"{task_name}_decoder")
+            )
+
+        assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder"
+
+        # set up encoder on top of the auxiliary MT decoder
+        if getattr(args, "synthesizer_encoder_layers", 0) > 0:
+            base_model.synthesizer_encoder = cls.build_t2u_encoder(unit_args)
+        else:
+            base_model.synthesizer_encoder = None
+
+        return base_model
+
+    @classmethod
+    def build_t2u_encoder(cls, args):  # encoder_* options are copied from decoder_*
+        _args = copy.deepcopy(args)
+        _args.encoder_layers = _args.synthesizer_encoder_layers
+        _args.encoder_embed_dim = args.decoder_embed_dim
+        _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim
+        _args.encoder_attention_heads = args.decoder_attention_heads
+        _args.encoder_normalize_before = True
+        return TransformerEncoderNoEmb(_args)
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        prev_output_tokens_mt,
+        return_all_hiddens=False,
+        tgt_speaker=None,
+        **kwargs,
+    ):
+        """
+        Run the speech encoder, the first-pass MT decoder, the optional
+        synthesizer (T2U) encoder and the unit decoder. The first-pass decoder
+        output is returned under decoder_out[-1]["mt_decoder_out"].
+        """
+        encoder_out = self.encoder(
+            src_tokens=src_tokens, src_lengths=src_lengths, **kwargs
+        )
+
+        # 1. MT decoder
+        mt_decoder = getattr(self, f"{self.mt_task_name}_decoder")
+        mt_decoder_out = mt_decoder(
+            prev_output_tokens_mt,
+            encoder_out=encoder_out,
+        )
+        x = mt_decoder_out[1]["inner_states"][-1]
+        if mt_decoder.layer_norm is not None:
+            x = mt_decoder.layer_norm(x)
+        if self.proj is not None:
+            x = self.proj(x)
+        # The padding mask stays None unless some MT input token equals padding_idx.
+        mt_decoder_padding_mask = None
+        if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
+            mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)
+
+        # 2. T2U encoder
+        if self.synthesizer_encoder is not None:
+            t2u_encoder_out = self.synthesizer_encoder(
+                x,
+                mt_decoder_padding_mask,
+            )
+        else:
+            t2u_encoder_out = {
+                "encoder_out": [x],  # T x B x C
+                "encoder_padding_mask": [mt_decoder_padding_mask],  # B x T
+            }
+
+        # 3. T2U decoder
+        if self.t2u_augmented_cross_attn:
+            decoder_out = self.decoder(
+                prev_output_tokens,
+                encoder_out=encoder_out,
+                encoder_out_aug=t2u_encoder_out,
+            )
+        else:
+            decoder_out = self.decoder(
+                prev_output_tokens,
+                encoder_out=t2u_encoder_out,
+            )
+        if return_all_hiddens:
+            decoder_out[-1]["encoder_states"] = encoder_out["encoder_out"]
+            # NOTE: from the top layer
+            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
+                "encoder_padding_mask"
+            ]
+        decoder_out[-1]["mt_decoder_out"] = mt_decoder_out
+        return decoder_out
+
+
+@register_model_architecture(
+    model_name="unity_xm_transformer", arch_name="unity_xm_transformer"
+)
+def base_architecture_unity(args):
+    """Apply the shared xm_transformer default groups, then force two flags off."""
+    for fill_defaults in (
+        set_default_general_args, set_default_w2v_encoder_args,
+        set_default_adaptor_args, set_default_transformer_decoder_args,
+    ):
+        fill_defaults(args)
+    args.layernorm_embedding = args.decoder_learned_pos = False
+
+
+# Legacy arch name kept for old models; it applies the unity_xm_transformer defaults.
+@register_model_architecture(
+    model_name="unity_xm_transformer", arch_name="xm_transformer_t2"
+)
+def base_architecture_unity_legacy(args):
+    base_architecture_unity(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
new file mode 100644
index 0000000000..c0dcd69b07
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .tacotron2 import *  # noqa
+from .tts_transformer import *  # noqa
+from .fastspeech2 import *  # noqa
+from .vocoder import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
new file mode 100644
index 0000000000..d1574dd63f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
@@ -0,0 +1,95 @@
+from argparse import Namespace
+import torch
+import torch.nn as nn
+
+from fairseq.models.text_to_speech.fastspeech2 import VariancePredictor
+from fairseq.models.text_to_speech.hifigan import Generator
+
+
+class CodeGenerator(Generator):
+    """``Generator`` fed with embedded codes and optional f0/speaker features."""
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self.dict = nn.Embedding(cfg["num_embeddings"], cfg["embedding_dim"])
+        self.multispkr = cfg.get("multispkr", None)
+        self.embedder = cfg.get("embedder_params", None)
+
+        if self.multispkr and not self.embedder:
+            self.spkr = nn.Embedding(cfg.get("num_speakers", 200), cfg["embedding_dim"])
+        elif self.embedder:
+            self.spkr = nn.Linear(cfg.get("embedder_dim", 256), cfg["embedding_dim"])
+
+        self.dur_predictor = None
+        dur_cfg = cfg.get("dur_predictor_params", None)
+        if dur_cfg:
+            self.dur_predictor = VariancePredictor(Namespace(**dur_cfg))
+
+        self.f0 = cfg.get("f0", None)
+        n_f0_bin = cfg.get("f0_quant_num_bin", 0)
+        self.f0_quant_embed = (
+            None if n_f0_bin <= 0 else nn.Embedding(n_f0_bin, cfg["embedding_dim"])
+        )
+
+    @staticmethod
+    def _upsample(signal, max_frames):
+        """Repeat each step of ``signal`` so that its last axis spans ``max_frames``.
+
+        Raises NotImplementedError when ``max_frames`` is not a multiple of the
+        signal length; padding the leftover frames is not implemented.
+        """
+        if signal.dim() == 2:
+            signal = signal.unsqueeze(2)
+        elif signal.dim() != 3:
+            signal = signal.view(-1, 1, 1)
+        bsz, channels, cond_length = signal.size()
+
+        signal = signal.unsqueeze(3).repeat(1, 1, 1, max_frames // cond_length)
+        # Test the whole leftover (not leftover // repeat factor) so that every
+        # misalignment is reported here rather than as a shape error in view().
+        if max_frames - signal.shape[2] * signal.shape[3] > 0:
+            raise NotImplementedError(
+                "Padding condition signal - misalignment between condition features."
+            )
+        return signal.view(bsz, channels, max_frames)
+
+    def forward(self, **kwargs):
+        """Embed ``kwargs["code"]``, concatenate the optional conditioning inputs
+        (f0, speaker, any extra features) as channels and run the generator."""
+        x = self.dict(kwargs["code"]).transpose(1, 2)
+
+        if self.dur_predictor and kwargs.get("dur_prediction", False):
+            assert x.size(0) == 1, "only support single sample"
+            frames = torch.exp(self.dur_predictor(x.transpose(1, 2))) - 1
+            dur_out = torch.clamp(torch.round(frames).long(), min=1)
+            # B x C x T
+            x = torch.repeat_interleave(x, dur_out.view(-1), dim=2)
+
+        if self.f0:
+            if self.f0_quant_embed:
+                kwargs["f0"] = self.f0_quant_embed(kwargs["f0"].long()).transpose(1, 2)
+            else:
+                kwargs["f0"] = kwargs["f0"].unsqueeze(1)
+
+            if x.shape[-1] < kwargs["f0"].shape[-1]:
+                x = self._upsample(x, kwargs["f0"].shape[-1])
+            elif x.shape[-1] > kwargs["f0"].shape[-1]:
+                kwargs["f0"] = self._upsample(kwargs["f0"], x.shape[-1])
+            x = torch.cat([x, kwargs["f0"]], dim=1)
+
+        if self.multispkr:
+            assert (
+                "spkr" in kwargs
+            ), 'require "spkr" input for multispeaker CodeHiFiGAN vocoder'
+            spkr = self.spkr(kwargs["spkr"]).transpose(1, 2)
+            spkr = self._upsample(spkr, x.shape[-1])
+            x = torch.cat([x, spkr], dim=1)
+
+        for k, feat in kwargs.items():
+            if k in ["spkr", "code", "f0", "dur_prediction"]:
+                continue
+
+            feat = self._upsample(feat, x.shape[-1])
+            x = torch.cat([x, feat], dim=1)
+
+        return super().forward(x)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
new file mode 100644
index 0000000000..fb2d0df37d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
@@ -0,0 +1,448 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+from torch import nn
+
+from fairseq import utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+from fairseq.models.text_to_speech.tacotron2 import Postnet
+from fairseq.modules import (
+    FairseqDropout,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def model_init(m):  # Xavier-uniform init with ReLU gain, applied to Conv1d weights only
+    if isinstance(m, nn.Conv1d):
+        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain("relu"))
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx=None):
+    table = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(table.weight, mean=0, std=embedding_dim**-0.5)  # whole weight
+    return table  # NOTE(review): the padding_idx row is re-drawn by normal_ as well
+
+
+class PositionwiseFeedForward(nn.Module):
+    """Two Conv1d layers with a ReLU in between, used as a residual block.
+
+    Dropout is applied to the convolution output before the residual add and
+    LayerNorm after it. Both convolutions share the kernel size and pad by
+    ``(kernel_size - 1) // 2``.
+    """
+
+    def __init__(self, in_dim, hidden_dim, kernel_size, dropout):
+        super().__init__()
+        conv_kwargs = dict(kernel_size=kernel_size, padding=(kernel_size - 1) // 2)
+        self.ffn = nn.Sequential(
+            nn.Conv1d(in_dim, hidden_dim, **conv_kwargs),
+            nn.ReLU(),
+            nn.Conv1d(hidden_dim, in_dim, **conv_kwargs),
+        )
+        self.layer_norm = LayerNorm(in_dim)
+        # One dropout module, reachable as both ``dropout`` and ``dropout_module``.
+        shared_dropout = FairseqDropout(p=dropout, module_name=self.__class__.__name__)
+        self.dropout = shared_dropout
+        self.dropout_module = shared_dropout
+
+    def forward(self, x):
+        """Apply the block to ``x``, laid out as B x T x C."""
+        residual = x
+        # Conv1d takes B x C x T, hence the transposes around ``ffn``.
+        conv_out = self.ffn(x.transpose(1, 2)).transpose(1, 2)
+        dropped = self.dropout(conv_out)
+        return self.layer_norm(dropped + residual)
+
+
+class FFTLayer(torch.nn.Module):
+    """Self-attention with residual LayerNorm, followed by PositionwiseFeedForward."""
+
+    def __init__(
+        self, embed_dim, n_heads, hidden_dim, kernel_size, dropout, attention_dropout
+    ):
+        super().__init__()
+        self.self_attn = MultiheadAttention(
+            embed_dim, n_heads, dropout=attention_dropout, self_attention=True
+        )
+        self.layer_norm = LayerNorm(embed_dim)
+        self.ffn = PositionwiseFeedForward(
+            embed_dim, hidden_dim, kernel_size, dropout=dropout
+        )
+
+    def forward(self, x, padding_mask=None):
+        """Process ``x`` (B x T x C); ``padding_mask`` is the attention key mask."""
+        seq_first = x.transpose(0, 1)  # attention is called on T x B x C
+        attn_out, _ = self.self_attn(
+            query=seq_first, key=seq_first, value=seq_first,
+            key_padding_mask=padding_mask, need_weights=False,
+        )
+        return self.ffn(self.layer_norm(attn_out.transpose(0, 1) + x))
+
+
+class LengthRegulator(nn.Module):
+    """Expand a sequence in time by repeating each frame a given number of times."""
+
+    def forward(self, x, durations):
+        """Repeat ``x[b, t]`` ``durations[b, t]`` times for every batch item.
+
+        ``x`` is B x T x C; returns the zero-padded result and per-item lengths.
+        """
+        out_lens = durations.sum(dim=1)
+        bsz, _, dim = x.size()
+        out = x.new_zeros((bsz, out_lens.max(), dim))
+        for b in range(bsz):
+            # One vectorised call instead of a Python loop with an item() per frame.
+            frames = x[b].repeat_interleave(durations[b].to(x.device), dim=0)
+            out[b, : frames.size(0)] = frames
+
+        return out, out_lens
+
+
+class VariancePredictor(nn.Module):
+    """Predict one scalar per position with two Conv1d + ReLU stages.
+
+    Each stage is followed by LayerNorm and dropout; a final linear layer maps
+    the hidden features to a single value.
+    """
+
+    def __init__(self, args):
+        super().__init__()
+        hidden, kernel = args.var_pred_hidden_dim, args.var_pred_kernel_size
+        # Same (kernel - 1) // 2 padding for both convolutions. conv2 used to be
+        # hard-coded to padding=1, which only matches kernel_size == 3.
+        pad = (kernel - 1) // 2
+        self.conv1 = nn.Sequential(
+            nn.Conv1d(args.encoder_embed_dim, hidden, kernel_size=kernel, padding=pad),
+            nn.ReLU(),
+        )
+        self.ln1 = nn.LayerNorm(hidden)
+        self.dropout_module = FairseqDropout(
+            p=args.var_pred_dropout, module_name=self.__class__.__name__
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv1d(hidden, hidden, kernel_size=kernel, padding=pad),
+            nn.ReLU(),
+        )
+        self.ln2 = nn.LayerNorm(hidden)
+        self.proj = nn.Linear(hidden, 1)
+
+    def forward(self, x):
+        # Input: B x T x C; Output: B x T (Conv1d runs on the transposed B x C x T)
+        x = self.conv1(x.transpose(1, 2)).transpose(1, 2)
+        x = self.dropout_module(self.ln1(x))
+        x = self.conv2(x.transpose(1, 2)).transpose(1, 2)
+        x = self.dropout_module(self.ln2(x))
+        return self.proj(x).squeeze(dim=2)
+
+
+class VarianceAdaptor(nn.Module):
+    """Add pitch and energy embeddings to the input, then expand it by durations."""
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.length_regulator = LengthRegulator()
+        self.duration_predictor = VariancePredictor(args)
+        self.pitch_predictor = VariancePredictor(args)
+        self.energy_predictor = VariancePredictor(args)
+
+        n_bins = self.args.var_pred_n_bins
+        # linspace yields n_bins - 1 boundaries; the embeddings hold n_bins rows.
+        self.pitch_bins = torch.linspace(args.pitch_min, args.pitch_max, n_bins - 1)
+        self.embed_pitch = Embedding(n_bins, args.encoder_embed_dim)
+        self.energy_bins = torch.linspace(args.energy_min, args.energy_max, n_bins - 1)
+        self.embed_energy = Embedding(n_bins, args.encoder_embed_dim)
+
+    def get_pitch_emb(self, x, tgt=None, factor=1.0):
+        """Return (pitch prediction, pitch embedding); ``tgt`` is embedded if given."""
+        pred = self.pitch_predictor(x)
+        edges = self.pitch_bins.to(x.device)
+        if tgt is not None:
+            # Embed the supplied target; the prediction is returned unscaled.
+            return pred, self.embed_pitch(torch.bucketize(tgt, edges))
+        pred = pred * factor
+        return pred, self.embed_pitch(torch.bucketize(pred, edges))
+
+    def get_energy_emb(self, x, tgt=None, factor=1.0):
+        """Return (energy prediction, embedding); ``tgt`` is embedded if given."""
+        pred = self.energy_predictor(x)
+        edges = self.energy_bins.to(x.device)
+        if tgt is not None:
+            # Embed the supplied target; the prediction is returned unscaled.
+            return pred, self.embed_energy(torch.bucketize(tgt, edges))
+        pred = pred * factor
+        return pred, self.embed_energy(torch.bucketize(pred, edges))
+
+    def forward(
+        self,
+        x,
+        padding_mask,
+        durations=None,
+        pitches=None,
+        energies=None,
+        d_factor=1.0,
+        p_factor=1.0,
+        e_factor=1.0,
+    ):
+        """Return (expanded x, output lengths, log-durations, pitch, energy)."""
+        # x: B x T x C
+        log_dur_out = self.duration_predictor(x)
+        rounded = torch.round((torch.exp(log_dur_out) - 1) * d_factor).long()
+        dur_out = torch.clamp(rounded, min=0)
+        dur_out.masked_fill_(padding_mask, 0)
+        pitch_out, pitch_emb = self.get_pitch_emb(x, pitches, p_factor)
+        x = x + pitch_emb
+        energy_out, energy_emb = self.get_energy_emb(x, energies, e_factor)
+        x = x + energy_emb
+
+        # Durations passed by the caller take precedence over the predicted ones.
+        chosen = dur_out if durations is None else durations
+        x, out_lens = self.length_regulator(x, chosen)
+        return x, out_lens, log_dur_out, pitch_out, energy_out
+
+
+class FastSpeech2Encoder(FairseqEncoder):  # encoder FFT -> variance adaptor -> decoder FFT
+    def __init__(self, args, src_dict, embed_speaker):
+        super().__init__(src_dict)
+        self.args = args
+        self.padding_idx = src_dict.pad()
+        self.n_frames_per_step = args.n_frames_per_step
+        self.out_dim = args.output_frame_dim * args.n_frames_per_step
+
+        self.embed_speaker = embed_speaker
+        self.spk_emb_proj = None
+        if embed_speaker is not None:
+            self.spk_emb_proj = nn.Linear(
+                args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
+            )
+
+        self.dropout_module = FairseqDropout(
+            p=args.dropout, module_name=self.__class__.__name__
+        )
+        self.embed_tokens = Embedding(
+            len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx
+        )
+
+        self.embed_positions = PositionalEmbedding(
+            args.max_source_positions, args.encoder_embed_dim, self.padding_idx
+        )
+        self.pos_emb_alpha = nn.Parameter(torch.ones(1))  # scales encoder positions
+        self.dec_pos_emb_alpha = nn.Parameter(torch.ones(1))  # scales decoder positions
+
+        self.encoder_fft_layers = nn.ModuleList(
+            FFTLayer(
+                args.encoder_embed_dim,
+                args.encoder_attention_heads,
+                args.fft_hidden_dim,
+                args.fft_kernel_size,
+                dropout=args.dropout,
+                attention_dropout=args.attention_dropout,
+            )
+            for _ in range(args.encoder_layers)
+        )
+
+        self.var_adaptor = VarianceAdaptor(args)
+
+        self.decoder_fft_layers = nn.ModuleList(
+            FFTLayer(
+                args.decoder_embed_dim,
+                args.decoder_attention_heads,
+                args.fft_hidden_dim,
+                args.fft_kernel_size,
+                dropout=args.dropout,
+                attention_dropout=args.attention_dropout,
+            )
+            for _ in range(args.decoder_layers)
+        )
+
+        self.out_proj = nn.Linear(args.decoder_embed_dim, self.out_dim)
+
+        self.postnet = None
+        if args.add_postnet:
+            self.postnet = Postnet(
+                self.out_dim,
+                args.postnet_conv_dim,
+                args.postnet_conv_kernel_size,
+                args.postnet_layers,
+                args.postnet_dropout,
+            )
+
+        self.apply(model_init)  # model_init only touches nn.Conv1d modules
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths=None,
+        speaker=None,
+        durations=None,
+        pitches=None,
+        energies=None,
+        **kwargs,
+    ):
+        x = self.embed_tokens(src_tokens)
+
+        enc_padding_mask = src_tokens.eq(self.padding_idx)
+        x += self.pos_emb_alpha * self.embed_positions(enc_padding_mask)
+        x = self.dropout_module(x)
+
+        for layer in self.encoder_fft_layers:
+            x = layer(x, enc_padding_mask)
+        # Speaker embedding is expanded over time and merged by spk_emb_proj.
+        if self.embed_speaker is not None:
+            bsz, seq_len, _ = x.size()
+            emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1)
+            x = self.spk_emb_proj(torch.cat([x, emb], dim=2))
+
+        x, out_lens, log_dur_out, pitch_out, energy_out = self.var_adaptor(
+            x, enc_padding_mask, durations, pitches, energies
+        )
+        # Decoder side: the padding mask is rebuilt from the expanded lengths.
+        dec_padding_mask = lengths_to_padding_mask(out_lens)
+        x += self.dec_pos_emb_alpha * self.embed_positions(dec_padding_mask)
+        for layer in self.decoder_fft_layers:
+            x = layer(x, dec_padding_mask)
+
+        x = self.out_proj(x)
+        x_post = None  # stays None when no postnet was built
+        if self.postnet is not None:
+            x_post = x + self.postnet(x)
+        return x, x_post, out_lens, log_dur_out, pitch_out, energy_out
+
+
+@register_model("fastspeech2")
+class FastSpeech2Model(FairseqEncoderModel):
+    """
+    Implementation of FastSpeech 2 (https://arxiv.org/abs/2006.04558).
+    """
+
+    NON_AUTOREGRESSIVE = True
+
+    @classmethod
+    def hub_models(cls):  # maps each model id to its .tar.gz URL under base_url
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/s2"
+        model_ids = [
+            "fastspeech2-en-ljspeech",
+            "fastspeech2-en-200_speaker-cv4",
+        ]
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        config_yaml="config.yaml",
+        vocoder: str = "griffin_lim",
+        fp16: bool = False,
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            config_yaml=config_yaml,
+            vocoder=vocoder,
+            fp16=fp16,
+            **kwargs,
+        )
+        return TTSHubInterface(x["args"], x["task"], x["models"][0])  # first model only
+
+    @staticmethod
+    def add_args(parser):  # no defaults here; base_architecture supplies them
+        parser.add_argument("--dropout", type=float)
+        parser.add_argument("--output-frame-dim", type=int)
+        parser.add_argument("--speaker-embed-dim", type=int)
+        # FFT blocks
+        parser.add_argument("--fft-hidden-dim", type=int)
+        parser.add_argument("--fft-kernel-size", type=int)
+        parser.add_argument("--attention-dropout", type=float)
+        parser.add_argument("--encoder-layers", type=int)
+        parser.add_argument("--encoder-embed-dim", type=int)
+        parser.add_argument("--encoder-attention-heads", type=int)
+        parser.add_argument("--decoder-layers", type=int)
+        parser.add_argument("--decoder-embed-dim", type=int)
+        parser.add_argument("--decoder-attention-heads", type=int)
+        # variance predictor
+        parser.add_argument("--var-pred-n-bins", type=int)
+        parser.add_argument("--var-pred-hidden-dim", type=int)
+        parser.add_argument("--var-pred-kernel-size", type=int)
+        parser.add_argument("--var-pred-dropout", type=float)
+        # postnet
+        parser.add_argument("--add-postnet", action="store_true")
+        parser.add_argument("--postnet-dropout", type=float)
+        parser.add_argument("--postnet-layers", type=int)
+        parser.add_argument("--postnet-conv-dim", type=int)
+        parser.add_argument("--postnet-conv-kernel-size", type=int)
+
+    def __init__(self, encoder, args, src_dict):
+        super().__init__(encoder)
+        self._num_updates = 0
+        # Optional CTC head: only built when args.ctc_weight is greater than zero.
+        out_dim = args.output_frame_dim * args.n_frames_per_step
+        self.ctc_proj = None
+        if getattr(args, "ctc_weight", 0.0) > 0.0:
+            self.ctc_proj = nn.Linear(out_dim, len(src_dict))
+
+    @classmethod
+    def build_model(cls, args, task):
+        embed_speaker = task.get_speaker_embeddings(args)
+        encoder = FastSpeech2Encoder(args, task.src_dict, embed_speaker)
+        return cls(encoder, args, task.src_dict)
+
+    def set_num_updates(self, num_updates):
+        super().set_num_updates(num_updates)
+        self._num_updates = num_updates
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        logits = self.ctc_proj(net_output[0])  # NOTE(review): ctc_proj may be None
+        if log_probs:
+            return utils.log_softmax(logits.float(), dim=-1)
+        else:
+            return utils.softmax(logits.float(), dim=-1)
+
+
+@register_model_architecture("fastspeech2", "fastspeech2")
+def base_architecture(args):  # an option keeps its value when args already has it
+    for option, fallback in {
+        "dropout": 0.2,
+        "output_frame_dim": 80,
+        "speaker_embed_dim": 64,
+        "fft_hidden_dim": 1024,  # FFT blocks
+        "fft_kernel_size": 9,
+        "attention_dropout": 0.0,
+        "encoder_layers": 4,
+        "encoder_embed_dim": 256,
+        "encoder_attention_heads": 2,
+        "decoder_layers": 4,
+        "decoder_embed_dim": 256,
+        "decoder_attention_heads": 2,
+        "var_pred_n_bins": 256,  # variance predictor
+        "var_pred_hidden_dim": 256,
+        "var_pred_kernel_size": 3,
+        "var_pred_dropout": 0.5,
+        "add_postnet": False,  # postnet
+        "postnet_dropout": 0.5,
+        "postnet_layers": 5,
+        "postnet_conv_dim": 512,
+        "postnet_conv_kernel_size": 5,
+    }.items():
+        setattr(args, option, getattr(args, option, fallback))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
new file mode 100644
index 0000000000..a852beefec
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
@@ -0,0 +1,179 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, weight_norm
+
+LRELU_SLOPE = 0.1
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Redraw ``m.weight`` from N(mean, std) when the class name of ``m`` contains "Conv"."""
+    if "Conv" in type(m).__name__:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return dilation * (kernel_size - 1) // 2  # half of the dilated kernel's extra span
+
+
+class ResBlock(torch.nn.Module):
+    """Residual block made of three pairs of weight-normalised 1-D convolutions.
+
+    Every convolution maps ``channels`` to ``channels`` with stride 1 and the
+    padding returned by ``get_padding``. In each pair the first convolution
+    uses one of the first three entries of ``dilation`` and the second always
+    uses dilation 1. ``init_weights`` is applied to both lists of layers.
+
+    Args:
+        channels: input and output channel count of every convolution.
+        kernel_size: kernel size shared by all six convolutions.
+        dilation: indexable with at least three entries; only indices 0, 1
+            and 2 are read.
+
+    Note:
+        ``dilation[0]`` .. ``dilation[2]`` are read inside ``__init__``, so a
+        shorter sequence raises ``IndexError`` at construction time.
+    """
+
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock, self).__init__()
+
+        def make_conv(rate):
+            """Build one weight-normalised convolution with dilation ``rate``.
+
+            The padding is derived from ``kernel_size`` and ``rate`` through
+            ``get_padding``.
+            """
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate),
+                )
+            )
+
+        # First convolution of each pair: dilation taken from ``dilation``.
+        self.convs1 = nn.ModuleList(
+            [
+                make_conv(dilation[0]),
+                make_conv(dilation[1]),
+                make_conv(dilation[2]),
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        # Second convolution of each pair: never dilated.
+        self.convs2 = nn.ModuleList(
+            [
+                make_conv(1),
+                make_conv(1),
+                make_conv(1),
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        """Run the three residual stages in order.
+
+        Each stage computes ``leaky_relu -> convs1[i] -> leaky_relu ->
+        convs2[i]`` with slope ``LRELU_SLOPE`` and adds the stage input back
+        onto the result.
+
+        Args:
+            x: input tensor fed to the first stage.
+
+        Returns:
+            The output of the last stage.
+        """
+        for dilated, plain in zip(self.convs1, self.convs2):
+            stage_in = x
+            hidden = dilated(F.leaky_relu(stage_in, LRELU_SLOPE))
+            hidden = plain(F.leaky_relu(hidden, LRELU_SLOPE))
+            x = hidden + stage_in
+        return x
+
+    def remove_weight_norm(self):
+        """Strip the weight-norm reparameterisation from all six convolutions.
+
+        Calls ``torch.nn.utils.remove_weight_norm`` on every layer of
+        ``convs1`` and then of ``convs2``.
+        """
+        for layer in self.convs1:
+            remove_weight_norm(layer)
+        for layer in self.convs2:
+            remove_weight_norm(layer)
+
+
+class Generator(torch.nn.Module):
+    """Convolutional generator: ``conv_pre``, upsampling stages, ``conv_post``, ``tanh``.
+
+    ``cfg`` is indexed with ``resblock_kernel_sizes``, ``resblock_dilation_sizes``,
+    ``upsample_rates``, ``upsample_kernel_sizes`` and ``upsample_initial_channel``;
+    ``cfg.get("model_in_dim", 80)`` gives the number of input channels.
+    """
+
+    def __init__(self, cfg):
+        super(Generator, self).__init__()
+        self.num_kernels = len(cfg["resblock_kernel_sizes"])
+        self.num_upsamples = len(cfg["upsample_rates"])
+        in_dim = cfg.get("model_in_dim", 80)
+        width = cfg["upsample_initial_channel"]
+        self.conv_pre = weight_norm(Conv1d(in_dim, width, 7, 1, padding=3))
+
+        # Stage i maps width // 2**i channels to width // 2**(i + 1) channels.
+        self.ups = nn.ModuleList()
+        stages = zip(cfg["upsample_rates"], cfg["upsample_kernel_sizes"])
+        for i, (rate, kernel) in enumerate(stages):
+            up = ConvTranspose1d(
+                width // (2**i),
+                width // (2 ** (i + 1)),
+                kernel,
+                rate,
+                padding=(kernel - rate) // 2,
+            )
+            self.ups.append(weight_norm(up))
+
+        # Every stage owns one ResBlock per (kernel size, dilation) pair.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = width // (2 ** (i + 1))
+            block_specs = zip(
+                cfg["resblock_kernel_sizes"], cfg["resblock_dilation_sizes"]
+            )
+            for kernel, dilation in block_specs:
+                self.resblocks.append(ResBlock(ch, kernel, dilation))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        """Map ``x`` through all stages; the result is squashed by ``tanh``."""
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
+            first = i * self.num_kernels
+            total = None
+            for j in range(self.num_kernels):
+                out = self.resblocks[first + j](x)
+                if total is None:
+                    total = out
+                else:
+                    total += out
+            x = total / self.num_kernels  # mean over this stage's ResBlocks
+        x = F.leaky_relu(x)  # default negative slope here, not LRELU_SLOPE
+        return torch.tanh(self.conv_post(x))
+
+    def remove_weight_norm(self):
+        """Remove weight norm from ``ups``, ``resblocks``, ``conv_pre`` and ``conv_post``."""
+        print("Removing weight norm...")
+        for layer in self.ups:
+            remove_weight_norm(layer)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
new file mode 100644
index 0000000000..e251c65c1d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import random
+from pathlib import Path
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+logger = logging.getLogger(__name__)
+
+
+class TTSHubInterface(nn.Module):  # wraps cfg, task and model behind predict(text) -> (waveform, task.sr)
+    def __init__(self, cfg, task, model):
+        super().__init__()
+        self.cfg = cfg
+        self.task = task
+        self.model = model
+        self.model.eval()  # the wrapped model is switched to eval mode once, here
+
+        self.update_cfg_with_data_cfg(self.cfg, self.task.data_cfg)  # writes cfg["task"].vocoder
+        self.generator = self.task.build_generator([self.model], self.cfg)
+
+    @classmethod
+    def phonemize(  # returns text unchanged unless phonemizer is "g2p", "g2pc" or "ipa"
+        cls,
+        text: str,
+        lang: Optional[str],  # only read (and required) by the "ipa" branch
+        phonemizer: Optional[str] = None,
+        preserve_punct: bool = False,  # only read by the "g2p" branch
+        to_simplified_zh: bool = False,
+    ):
+        if to_simplified_zh:
+            import hanziconv  # third-party; imported lazily so it is only needed when used
+
+            text = hanziconv.HanziConv.toSimplified(text)
+
+        if phonemizer == "g2p":
+            import g2p_en  # third-party; imported lazily
+
+            g2p = g2p_en.G2p()
+            if preserve_punct:
+                return " ".join("|" if p == " " else p for p in g2p(text))  # " " tokens become "|"
+            else:
+                res = [{",": "sp", ";": "sp"}.get(p, p) for p in g2p(text)]  # "," and ";" become "sp"
+                return " ".join(p for p in res if p.isalnum())  # every non-alphanumeric token is dropped
+        if phonemizer == "g2pc":
+            import g2pc  # third-party; imported lazily
+
+            g2p = g2pc.G2pC()
+            return " ".join([w[3] for w in g2p(text)])  # keeps element 3 of each returned item
+        elif phonemizer == "ipa":
+            assert lang is not None
+            import phonemizer  # NOTE: rebinds the `phonemizer` parameter; all comparisons are above
+            from phonemizer.separator import Separator
+
+            lang_map = {"en": "en-us", "fr": "fr-fr"}  # other codes are passed through unchanged
+            return phonemizer.phonemize(
+                text,
+                backend="espeak",
+                language=lang_map.get(lang, lang),
+                separator=Separator(word="| ", phone=" "),
+            )
+        else:
+            return text
+
+    @classmethod
+    def tokenize(cls, text: str, tkn_cfg: Dict[str, str]):  # no-op without "sentencepiece_model"
+        sentencepiece_model = tkn_cfg.get("sentencepiece_model", None)
+        if sentencepiece_model is not None:
+            assert Path(sentencepiece_model).exists()
+            import sentencepiece as sp  # third-party; imported lazily
+
+            spm = sp.SentencePieceProcessor()
+            spm.Load(sentencepiece_model)
+            return " ".join(spm.Encode(text, out_type=str))  # pieces joined by single spaces
+        else:
+            return text
+
+    @classmethod
+    def update_cfg_with_data_cfg(cls, cfg, data_cfg):  # mutates cfg in place, returns None
+        cfg["task"].vocoder = data_cfg.vocoder.get("type", "griffin_lim")  # "griffin_lim" if no type
+
+    @classmethod
+    def get_model_input(  # builds the sample dict later handed to generator.generate
+        cls, task, text: str, speaker: Optional[int] = None, verbose: bool = False
+    ):
+        phonemized = cls.phonemize(
+            text,
+            task.data_cfg.hub.get("lang", None),
+            task.data_cfg.hub.get("phonemizer", None),
+            task.data_cfg.hub.get("preserve_punct", False),
+            task.data_cfg.hub.get("to_simplified_zh", False),
+        )
+        tkn_cfg = task.data_cfg.bpe_tokenizer
+        tokenized = cls.tokenize(phonemized, tkn_cfg)
+        if verbose:
+            logger.info(f"text: {text}")
+            logger.info(f"phonemized: {phonemized}")
+            logger.info(f"tokenized: {tokenized}")
+
+        spk = task.data_cfg.hub.get("speaker", speaker)  # a hub-config speaker overrides the argument
+        n_speakers = len(task.speaker_to_id or {})
+        if spk is None and n_speakers > 0:
+            spk = random.randint(0, n_speakers - 1)  # no speaker requested: draw one at random
+        if spk is not None:
+            spk = max(0, min(spk, n_speakers - 1))  # clamp to [0, n_speakers - 1]; 0 if no speakers
+        if verbose:
+            logger.info(f"speaker: {spk}")
+        spk = None if spk is None else torch.Tensor([[spk]]).long()  # shape (1, 1)
+
+        src_tokens = task.src_dict.encode_line(tokenized, add_if_not_exist=False).view(
+            1, -1
+        )
+        # NOTE(review): counts whitespace tokens only; confirm it matches src_tokens' length
+        src_lengths = torch.Tensor([len(tokenized.split())]).long()
+        return {
+            "net_input": {
+                "src_tokens": src_tokens,
+                "src_lengths": src_lengths,
+                "prev_output_tokens": None,
+            },
+            "target_lengths": None,
+            "speaker": spk,
+        }
+
+    @classmethod
+    def get_prediction(cls, task, model, generator, sample) -> Tuple[torch.Tensor, int]:
+        prediction = generator.generate(model, sample)
+        return prediction[0]["waveform"], task.sr  # only the first prediction is returned
+
+    def predict(  # text -> get_model_input -> get_prediction
+        self, text: str, speaker: Optional[int] = None, verbose: bool = False
+    ) -> Tuple[torch.Tensor, int]:
+        sample = self.get_model_input(self.task, text, speaker, verbose=verbose)
+        return self.get_prediction(self.task, self.model, self.generator, sample)
+
+
+class VocoderHubInterface(nn.Module):
+    """Vocoder interface to run vocoder models through hub. Currently we only support unit vocoder"""
+
+    def __init__(self, cfg, model):
+        super().__init__()
+        self.vocoder = model
+        self.vocoder.eval()
+        self.sr = 16000
+        self.multispkr = self.vocoder.model.multispkr
+        if self.multispkr:
+            logger.info("multi-speaker vocoder")
+            # following the default in codehifigan to set to 200
+            self.num_speakers = cfg.get("num_speakers", 200)
+
+    def get_model_input(self, text: str, speaker: Optional[int] = -1):
+        """Build the vocoder input dict from a whitespace-separated unit string.
+
+        ``speaker`` is only used by multi-speaker vocoders: an id in
+        ``[0, num_speakers)`` is used as given, ``None`` or ``-1`` picks a random id.
+        """
+        units = list(map(int, text.strip().split()))
+        x = {"code": torch.LongTensor(units).view(1, -1)}
+        # Only ``None`` means unset; ``not speaker`` would also discard the valid id 0.
+        if speaker is None:
+            speaker = -1
+        if self.multispkr:
+            assert (
+                -1 <= speaker < self.num_speakers
+            ), f"invalid --speaker-id ({speaker}) with total #speakers = {self.num_speakers}"
+            spk = random.randint(0, self.num_speakers - 1) if speaker == -1 else speaker
+            x["spkr"] = torch.LongTensor([spk]).view(1, 1)
+        return x
+
+    def get_prediction(self, sample, dur_prediction: Optional[bool] = True):
+        """Run the vocoder on ``sample`` and return ``(wav, self.sr)``."""
+        wav = self.vocoder(sample, dur_prediction)
+        return wav, self.sr
+
+    def predict(
+        self,
+        text: str,
+        speaker: Optional[int] = None,
+        dur_prediction: Optional[bool] = True,
+    ):
+        """Turn a unit string into ``(wav, self.sr)`` via ``get_model_input`` and ``get_prediction``."""
+        sample = self.get_model_input(text, speaker)
+        return self.get_prediction(sample, dur_prediction)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
new file mode 100644
index 0000000000..4df4075617
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
@@ -0,0 +1,380 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import LSTMCellWithZoneOut, LocationAttention
+
+
+logger = logging.getLogger(__name__)
+
+
+def encoder_init(m):
+    if isinstance(m, nn.Conv1d):  # every other module type is left untouched
+        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain("relu"))
+
+
+class Tacotron2Encoder(FairseqEncoder):
+    """Token embedding, a Conv1d/BatchNorm1d/ReLU/Dropout stack and a bidirectional LSTM.
+
+    When ``embed_speaker`` is given, its output is concatenated to every encoder
+    state and projected back to ``args.encoder_embed_dim``.
+    """
+
+    def __init__(self, args, src_dict, embed_speaker):
+        super().__init__(src_dict)
+        dim = args.encoder_embed_dim
+        self.padding_idx = src_dict.pad()
+        self.embed_speaker = embed_speaker
+        self.spk_emb_proj = None
+        if embed_speaker is not None:
+            self.spk_emb_proj = nn.Linear(dim + args.speaker_embed_dim, dim)
+
+        self.embed_tokens = nn.Embedding(
+            len(src_dict), dim, padding_idx=self.padding_idx
+        )
+        kernel = args.encoder_conv_kernel_size
+        assert kernel % 2 == 1
+
+        def conv_block():
+            return nn.Sequential(
+                nn.Conv1d(dim, dim, kernel_size=kernel, padding=(kernel - 1) // 2),
+                nn.BatchNorm1d(dim),
+                nn.ReLU(),
+                nn.Dropout(args.encoder_dropout),
+            )
+
+        self.convolutions = nn.ModuleList(
+            conv_block() for _ in range(args.encoder_conv_layers)
+        )
+        self.lstm = nn.LSTM(
+            dim,
+            dim // 2,
+            num_layers=args.encoder_lstm_layers,
+            batch_first=True,
+            bidirectional=True,
+        )
+        self.apply(encoder_init)
+
+    def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs):
+        """Encode ``src_tokens``; ``src_lengths`` is required despite its ``None`` default."""
+        x = self.embed_tokens(src_tokens)
+        x = x.transpose(1, 2).contiguous()  # B x T x C -> B x C x T
+        for conv in self.convolutions:
+            x = conv(x)
+        x = x.transpose(1, 2).contiguous()  # B x C x T -> B x T x C
+
+        lengths = src_lengths.cpu().long()
+        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
+        packed_out, _ = self.lstm(packed)
+        x, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
+
+        if self.embed_speaker is not None:
+            bsz, seq_len, _ = x.size()  # x is batch-first: B x T x C
+            emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1)
+            x = self.spk_emb_proj(torch.cat([x, emb], dim=2))
+
+        return {
+            "encoder_out": [x],  # B x T x C
+            "encoder_padding_mask": src_tokens.eq(self.padding_idx),  # B x T
+        }
+
+
+class Prenet(nn.Module):
+    """``n_layers`` Linear+ReLU layers, each followed by dropout that stays on in eval mode."""
+    def __init__(self, in_dim, n_layers, n_units, dropout):
+        super().__init__()
+        dims = [in_dim if i == 0 else n_units for i in range(n_layers)]
+        blocks = (nn.Sequential(nn.Linear(d, n_units), nn.ReLU()) for d in dims)
+        self.layers = nn.ModuleList(blocks)
+        self.dropout = dropout
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = F.dropout(layer(x), p=self.dropout)  # always applies dropout
+        return x
+
+
+class Postnet(nn.Module):
+    """Stack of ``n_layers`` Conv1d + BatchNorm1d (+ Tanh) + Dropout stages.
+
+    Tanh is left out of the last stage, which maps back to ``in_dim`` channels.
+    ``forward`` takes and returns tensors laid out as B x T x C.
+    """
+
+    def __init__(self, in_dim, n_channels, kernel_size, n_layers, dropout):
+        super(Postnet, self).__init__()
+        self.convolutions = nn.ModuleList()
+        assert kernel_size % 2 == 1
+        same_pad = (kernel_size - 1) // 2
+        for i in range(n_layers):
+            is_last = i == n_layers - 1
+            src_dim = in_dim if i == 0 else n_channels
+            dst_dim = in_dim if is_last else n_channels
+            conv = nn.Conv1d(src_dim, dst_dim, kernel_size=kernel_size, padding=same_pad)
+            stage = [conv, nn.BatchNorm1d(dst_dim)]
+            if not is_last:
+                stage.append(nn.Tanh())
+            stage.append(nn.Dropout(dropout))
+            gain = nn.init.calculate_gain("linear" if is_last else "tanh")
+            nn.init.xavier_uniform_(conv.weight, gain)
+            self.convolutions.append(nn.Sequential(*stage))
+
+    def forward(self, x):
+        y = x.transpose(1, 2)  # B x T x C -> B x C x T
+        for stage in self.convolutions:
+            y = stage(y)
+        return y.transpose(1, 2)  # B x C x T -> B x T x C
+
+
+def decoder_init(m):
+    if isinstance(m, nn.Conv1d):  # every other module type is left untouched
+        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain("tanh"))
+
+
+class Tacotron2Decoder(FairseqIncrementalDecoder):  # prenet -> attention LSTM -> attention -> LSTM stack -> projections -> postnet
+    def __init__(self, args, src_dict):
+        super().__init__(None)
+        self.args = args
+        self.n_frames_per_step = args.n_frames_per_step
+        self.out_dim = args.output_frame_dim * args.n_frames_per_step  # width of one decoder step's output
+
+        self.prenet = Prenet(
+            self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout
+        )
+
+        # take prev_context, prev_frame, (speaker embedding) as input
+        self.attention_lstm = LSTMCellWithZoneOut(
+            args.zoneout,
+            args.prenet_dim + args.encoder_embed_dim,
+            args.decoder_lstm_dim,
+        )
+
+        # take attention_lstm output, attention_state, encoder_out as input
+        self.attention = LocationAttention(
+            args.attention_dim,
+            args.encoder_embed_dim,
+            args.decoder_lstm_dim,
+            (1 + int(args.attention_use_cumprob)),  # 2 when cumulative weights are used, else 1
+            args.attention_conv_dim,
+            args.attention_conv_kernel_size,
+        )
+
+        # take attention_lstm output, context, (gated_latent) as input
+        self.lstm = nn.ModuleList(
+            LSTMCellWithZoneOut(
+                args.zoneout,
+                args.encoder_embed_dim + args.decoder_lstm_dim,
+                args.decoder_lstm_dim,
+            )
+            for i in range(args.decoder_lstm_layers)
+        )
+
+        proj_in_dim = args.encoder_embed_dim + args.decoder_lstm_dim
+        self.feat_proj = nn.Linear(proj_in_dim, self.out_dim)
+        self.eos_proj = nn.Linear(proj_in_dim, 1)  # one end-of-sequence value per step
+
+        self.postnet = Postnet(
+            self.out_dim,
+            args.postnet_conv_dim,
+            args.postnet_conv_kernel_size,
+            args.postnet_layers,
+            args.postnet_dropout,
+        )
+
+        self.ctc_proj = None
+        if getattr(args, "ctc_weight", 0.0) > 0.0:  # built only when args sets a positive ctc_weight
+            self.ctc_proj = nn.Linear(self.out_dim, len(src_dict))
+
+        self.apply(decoder_init)  # re-initialises every nn.Conv1d submodule
+
+    def _get_states(self, incremental_state, enc_out):  # cached states, or zeros when nothing is cached
+        bsz, in_len, _ = enc_out.size()
+        alstm_h = self.get_incremental_state(incremental_state, "alstm_h")
+        if alstm_h is None:
+            alstm_h = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim)
+        alstm_c = self.get_incremental_state(incremental_state, "alstm_c")
+        if alstm_c is None:
+            alstm_c = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim)
+
+        lstm_h = self.get_incremental_state(incremental_state, "lstm_h")
+        if lstm_h is None:
+            lstm_h = [  # one hidden state per decoder LSTM layer
+                enc_out.new_zeros(bsz, self.args.decoder_lstm_dim)
+                for _ in range(self.args.decoder_lstm_layers)
+            ]
+        lstm_c = self.get_incremental_state(incremental_state, "lstm_c")
+        if lstm_c is None:
+            lstm_c = [  # one cell state per decoder LSTM layer
+                enc_out.new_zeros(bsz, self.args.decoder_lstm_dim)
+                for _ in range(self.args.decoder_lstm_layers)
+            ]
+
+        attn_w = self.get_incremental_state(incremental_state, "attn_w")
+        if attn_w is None:
+            attn_w = enc_out.new_zeros(bsz, in_len)
+        attn_w_cum = self.get_incremental_state(incremental_state, "attn_w_cum")
+        if attn_w_cum is None:
+            attn_w_cum = enc_out.new_zeros(bsz, in_len)
+        return alstm_h, alstm_c, lstm_h, lstm_c, attn_w, attn_w_cum
+
+    def _get_init_attn_c(self, enc_out, enc_mask):  # initial attention context: zeros or masked mean
+        bsz = enc_out.size(0)
+        if self.args.init_attn_c == "zero":
+            return enc_out.new_zeros(bsz, self.args.encoder_embed_dim)
+        elif self.args.init_attn_c == "avg":
+            enc_w = (~enc_mask).type(enc_out.type())  # 1 where enc_mask is False, 0 where it is True
+            enc_w = enc_w / enc_w.sum(dim=1, keepdim=True)  # each row now sums to 1
+            return torch.sum(enc_out * enc_w.unsqueeze(2), dim=1)  # weighted mean over dim 1
+        else:
+            raise ValueError(f"{self.args.init_attn_c} not supported")
+
+    def forward(  # returns (post-net features, eos values, {"attn", "feature_out"})
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        incremental_state=None,
+        target_lengths=None,
+        **kwargs,
+    ):
+        enc_mask = encoder_out["encoder_padding_mask"]
+        enc_out = encoder_out["encoder_out"][0]
+        in_len = enc_out.size(1)
+
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:, :]  # incremental mode: newest step only
+        bsz, out_len, _ = prev_output_tokens.size()
+
+        prenet_out = self.prenet(prev_output_tokens)
+        (alstm_h, alstm_c, lstm_h, lstm_c, attn_w, attn_w_cum) = self._get_states(
+            incremental_state, enc_out
+        )
+        # NOTE(review): never cached, so each incremental call restarts from the initial context
+        attn_ctx = self._get_init_attn_c(enc_out, enc_mask)
+
+        attn_out = enc_out.new_zeros(bsz, in_len, out_len)
+        feat_out = enc_out.new_zeros(bsz, out_len, self.out_dim)
+        eos_out = enc_out.new_zeros(bsz, out_len)
+        for t in range(out_len):
+            alstm_in = torch.cat((attn_ctx, prenet_out[:, t, :]), dim=1)  # previous context + prenet frame
+            alstm_h, alstm_c = self.attention_lstm(alstm_in, (alstm_h, alstm_c))
+
+            attn_state = attn_w.unsqueeze(1)
+            if self.args.attention_use_cumprob:
+                attn_state = torch.stack((attn_w, attn_w_cum), dim=1)  # previous and cumulative weights
+            attn_ctx, attn_w = self.attention(enc_out, enc_mask, alstm_h, attn_state)
+            attn_w_cum = attn_w_cum + attn_w
+            attn_out[:, :, t] = attn_w
+
+            for i, cur_lstm in enumerate(self.lstm):
+                if i == 0:
+                    lstm_in = torch.cat((attn_ctx, alstm_h), dim=1)  # first layer reads the attention LSTM
+                else:
+                    lstm_in = torch.cat((attn_ctx, lstm_h[i - 1]), dim=1)  # later layers read the layer below
+                lstm_h[i], lstm_c[i] = cur_lstm(lstm_in, (lstm_h[i], lstm_c[i]))  # lists updated in place
+
+            proj_in = torch.cat((attn_ctx, lstm_h[-1]), dim=1)
+            feat_out[:, t, :] = self.feat_proj(proj_in)
+            eos_out[:, t] = self.eos_proj(proj_in).squeeze(1)
+        self.attention.clear_cache()
+
+        self.set_incremental_state(incremental_state, "alstm_h", alstm_h)  # persist states for the next call
+        self.set_incremental_state(incremental_state, "alstm_c", alstm_c)
+        self.set_incremental_state(incremental_state, "lstm_h", lstm_h)
+        self.set_incremental_state(incremental_state, "lstm_c", lstm_c)
+        self.set_incremental_state(incremental_state, "attn_w", attn_w)
+        self.set_incremental_state(incremental_state, "attn_w_cum", attn_w_cum)
+
+        post_feat_out = feat_out + self.postnet(feat_out)  # postnet output is added as a residual
+        eos_out = eos_out.view(bsz, out_len, 1)
+        return post_feat_out, eos_out, {"attn": attn_out, "feature_out": feat_out}
+
+
+@register_model("tacotron_2")
+class Tacotron2Model(FairseqEncoderDecoderModel):
+    """Implementation for https://arxiv.org/pdf/1712.05884.pdf"""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the model's command-line options on ``parser``."""
+        add = parser.add_argument
+        # encoder
+        add("--encoder-dropout", type=float)
+        add("--encoder-embed-dim", type=int)
+        add("--encoder-conv-layers", type=int)
+        add("--encoder-conv-kernel-size", type=int)
+        add("--encoder-lstm-layers", type=int)
+        # decoder
+        add("--attention-dim", type=int)
+        add("--attention-conv-dim", type=int)
+        add("--attention-conv-kernel-size", type=int)
+        add("--prenet-dropout", type=float)
+        add("--prenet-layers", type=int)
+        add("--prenet-dim", type=int)
+        add("--postnet-dropout", type=float)
+        add("--postnet-layers", type=int)
+        add("--postnet-conv-dim", type=int)
+        add("--postnet-conv-kernel-size", type=int)
+        add("--init-attn-c", type=str)
+        add("--attention-use-cumprob", action="store_true")
+        add("--zoneout", type=float)
+        add("--decoder-lstm-layers", type=int)
+        add("--decoder-lstm-dim", type=int)
+        add("--output-frame-dim", type=int)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_updates = 0  # overwritten by every set_num_updates call
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build encoder and decoder from ``args`` and the task's source dictionary."""
+        speaker_table = task.get_speaker_embeddings(args)
+        encoder = Tacotron2Encoder(args, task.src_dict, speaker_table)
+        return cls(encoder, Tacotron2Decoder(args, task.src_dict))
+
+    def forward_encoder(self, src_tokens, src_lengths, **kwargs):
+        return self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)  # encoder-only pass
+
+    def set_num_updates(self, num_updates):
+        super().set_num_updates(num_updates)
+        self._num_updates = num_updates
+
+
+@register_model_architecture("tacotron_2", "tacotron_2")
+def base_architecture(args):  # give each unset Tacotron 2 hyper-parameter its default
+    def fill(name, value):
+        setattr(args, name, getattr(args, name, value))
+    fill("encoder_dropout", 0.5)  # encoder options from here on
+    fill("encoder_embed_dim", 512)
+    fill("encoder_conv_layers", 3)
+    fill("encoder_conv_kernel_size", 5)
+    fill("encoder_lstm_layers", 1)
+    fill("attention_dim", 128)  # decoder options from here on
+    fill("attention_conv_dim", 32)
+    fill("attention_conv_kernel_size", 15)
+    fill("prenet_dropout", 0.5)
+    fill("prenet_layers", 2)
+    fill("prenet_dim", 256)
+    fill("postnet_dropout", 0.5)
+    fill("postnet_layers", 5)
+    fill("postnet_conv_dim", 512)
+    fill("postnet_conv_kernel_size", 5)
+    fill("init_attn_c", "zero")
+    fill("attention_use_cumprob", True)
+    fill("zoneout", 0.1)
+    fill("decoder_lstm_layers", 2)
+    fill("decoder_lstm_dim", 1024)
+    fill("output_frame_dim", 80)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
new file mode 100644
index 0000000000..19afc2b717
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
@@ -0,0 +1,454 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Optional
+
+import torch
+from torch import nn
+
+from fairseq import utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+from fairseq.models.text_to_speech.tacotron2 import Postnet, Prenet
+from fairseq.modules import (
+    FairseqDropout,
+    LayerNorm,
+    PositionalEmbedding,
+    TransformerDecoderLayer,
+    TransformerEncoderLayer,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def encoder_init(m):  # per-module hook; the encoder passes it to self.apply()
+    if isinstance(m, nn.Conv1d):  # only Conv1d weights are touched (ReLU gain)
+        nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("relu"))
+
+
+def Embedding(num_embeddings, embedding_dim):  # NOTE(review): no visible caller
+    m = nn.Embedding(num_embeddings, embedding_dim)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)  # std = dim^-0.5
+    return m
+
+
+class TTSTransformerEncoder(FairseqEncoder):
+    def __init__(self, args, src_dict, embed_speaker):
+        super().__init__(src_dict)
+        self.padding_idx = src_dict.pad()
+        self.embed_speaker = embed_speaker
+        self.spk_emb_proj = None  # only created when a speaker embedding is given
+        if embed_speaker is not None:
+            self.spk_emb_proj = nn.Linear(
+                args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
+            )
+
+        self.dropout_module = FairseqDropout(
+            p=args.dropout, module_name=self.__class__.__name__
+        )
+        self.embed_tokens = nn.Embedding(
+            len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx
+        )
+        assert args.encoder_conv_kernel_size % 2 == 1  # odd: padding below keeps T
+        self.prenet = nn.ModuleList(
+            nn.Sequential(
+                nn.Conv1d(
+                    args.encoder_embed_dim,
+                    args.encoder_embed_dim,
+                    kernel_size=args.encoder_conv_kernel_size,
+                    padding=((args.encoder_conv_kernel_size - 1) // 2),
+                ),
+                nn.BatchNorm1d(args.encoder_embed_dim),
+                nn.ReLU(),
+                nn.Dropout(args.encoder_dropout),
+            )
+            for _ in range(args.encoder_conv_layers)
+        )
+        self.prenet_proj = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
+        self.embed_positions = PositionalEmbedding(
+            args.max_source_positions, args.encoder_embed_dim, self.padding_idx
+        )
+        self.pos_emb_alpha = nn.Parameter(torch.ones(1))  # learnable position scale
+
+        self.transformer_layers = nn.ModuleList(
+            TransformerEncoderLayer(args)
+            for _ in range(args.encoder_transformer_layers)
+        )
+        if args.encoder_normalize_before:
+            self.layer_norm = LayerNorm(args.encoder_embed_dim)
+        else:
+            self.layer_norm = None
+        # re-initialise the weights of every Conv1d sub-module created above
+        self.apply(encoder_init)
+
+    def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs):
+        x = self.embed_tokens(src_tokens)
+        x = x.transpose(1, 2).contiguous()  # B x T x C -> B x C x T
+        for conv in self.prenet:
+            x = conv(x)
+        x = x.transpose(1, 2).contiguous()  # B x C x T -> B x T x C
+        x = self.prenet_proj(x)
+
+        padding_mask = src_tokens.eq(self.padding_idx)
+        positions = self.embed_positions(padding_mask)
+        x += self.pos_emb_alpha * positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        for layer in self.transformer_layers:
+            x = layer(x, padding_mask)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+        # concatenate the speaker embedding to every time step, project back to C
+        if self.embed_speaker is not None:
+            seq_len, bsz, _ = x.size()
+            emb = self.embed_speaker(speaker).transpose(0, 1)
+            emb = emb.expand(seq_len, bsz, -1)
+            x = self.spk_emb_proj(torch.cat([x, emb], dim=2))
+
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [padding_mask]
+            if padding_mask.any()
+            else [],  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": [],  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+
+
+def decoder_init(m):  # per-module hook; the decoder passes it to self.apply()
+    if isinstance(m, torch.nn.Conv1d):  # only Conv1d weights are touched (tanh gain)
+        nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("tanh"))
+
+
+class TTSTransformerDecoder(FairseqIncrementalDecoder):
+    def __init__(self, args, src_dict, padding_idx=1):
+        super().__init__(None)
+        self._future_mask = torch.empty(0)  # cache used by buffered_future_mask
+
+        self.args = args
+        self.padding_idx = src_dict.pad() if src_dict else padding_idx
+        self.n_frames_per_step = args.n_frames_per_step
+        self.out_dim = args.output_frame_dim * args.n_frames_per_step
+
+        self.dropout_module = FairseqDropout(
+            args.dropout, module_name=self.__class__.__name__
+        )
+        self.embed_positions = PositionalEmbedding(
+            args.max_target_positions, args.decoder_embed_dim, self.padding_idx
+        )
+        self.pos_emb_alpha = nn.Parameter(torch.ones(1))  # learnable position scale
+        self.prenet = nn.Sequential(
+            Prenet(
+                self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout
+            ),
+            nn.Linear(args.prenet_dim, args.decoder_embed_dim),
+        )
+
+        self.n_transformer_layers = args.decoder_transformer_layers
+        self.transformer_layers = nn.ModuleList(
+            TransformerDecoderLayer(args) for _ in range(self.n_transformer_layers)
+        )
+        if args.decoder_normalize_before:
+            self.layer_norm = LayerNorm(args.decoder_embed_dim)
+        else:
+            self.layer_norm = None
+
+        self.feat_proj = nn.Linear(args.decoder_embed_dim, self.out_dim)
+        self.eos_proj = nn.Linear(args.decoder_embed_dim, 1)
+
+        self.postnet = Postnet(
+            self.out_dim,
+            args.postnet_conv_dim,
+            args.postnet_conv_kernel_size,
+            args.postnet_layers,
+            args.postnet_dropout,
+        )
+
+        self.ctc_proj = None  # optional CTC head; NOTE(review): needs a src_dict
+        if getattr(args, "ctc_weight", 0.0) > 0.0:
+            self.ctc_proj = nn.Linear(self.out_dim, len(src_dict))
+        # re-initialise the weights of every Conv1d sub-module created above
+        self.apply(decoder_init)
+
+    def extract_features(
+        self,
+        prev_outputs,
+        encoder_out=None,
+        incremental_state=None,
+        target_lengths=None,
+        speaker=None,
+        **kwargs,
+    ):
+        alignment_layer = self.n_transformer_layers - 1  # attn comes from last layer
+        self_attn_padding_mask = lengths_to_padding_mask(target_lengths)
+        positions = self.embed_positions(
+            self_attn_padding_mask, incremental_state=incremental_state
+        )
+        # incremental decoding: only the newest step is fed through the layers
+        if incremental_state is not None:
+            prev_outputs = prev_outputs[:, -1:, :]
+            self_attn_padding_mask = self_attn_padding_mask[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        x = self.prenet(prev_outputs)
+        x += self.pos_emb_alpha * positions
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        if not self_attn_padding_mask.any():
+            self_attn_padding_mask = None
+
+        attn: Optional[torch.Tensor] = None
+        inner_states: List[Optional[torch.Tensor]] = [x]
+        for idx, transformer_layer in enumerate(self.transformer_layers):
+            if incremental_state is None:
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+
+            x, layer_attn, _ = transformer_layer(
+                x,
+                encoder_out["encoder_out"][0]
+                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
+                else None,
+                encoder_out["encoder_padding_mask"][0]
+                if (
+                    encoder_out is not None
+                    and len(encoder_out["encoder_padding_mask"]) > 0
+                )
+                else None,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+            inner_states.append(x)
+            if layer_attn is not None and idx == alignment_layer:
+                attn = layer_attn.float().to(x)
+
+        if attn is not None:
+            # average probabilities over heads, transpose to
+            # (B, src_len, tgt_len)
+            attn = attn.mean(dim=0).transpose(2, 1)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, {"attn": attn, "inner_states": inner_states}
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        incremental_state=None,
+        target_lengths=None,
+        speaker=None,
+        **kwargs,
+    ):
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            target_lengths=target_lengths,
+            speaker=speaker,
+            **kwargs,
+        )
+        attn = extra["attn"]
+        feat_out = self.feat_proj(x)
+        # eos_proj maps each decoder state to one raw end-of-sequence score
+        eos_out = self.eos_proj(x)
+        post_feat_out = feat_out + self.postnet(feat_out)  # postnet output as residual
+        return (
+            post_feat_out,
+            eos_out,
+            {
+                "attn": attn,
+                "feature_out": feat_out,
+                "inner_states": extra["inner_states"],
+            },
+        )
+
+    def get_normalized_probs(self, net_output, log_probs, sample):
+        if self.ctc_proj is None:  # the head is only built when ctc_weight > 0
+            raise RuntimeError("ctc_proj is disabled (requires ctc_weight > 0)")
+        logits = self.ctc_proj(net_output[2]["feature_out"]).float()
+        normalize = utils.log_softmax if log_probs else utils.softmax
+        return normalize(logits, dim=-1)
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)
+        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
+        if (
+            self._future_mask.size(0) == 0
+            or (not self._future_mask.device == tensor.device)
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1
+            )
+        self._future_mask = self._future_mask.to(tensor)
+        return self._future_mask[:dim, :dim]
+
+
+@register_model("tts_transformer")
+class TTSTransformerModel(FairseqEncoderDecoderModel):
+    """
+    Transformer TTS model; implementation for https://arxiv.org/pdf/1809.08895.pdf
+    """
+
+    @classmethod
+    def hub_models(cls):
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/s2"  # NOTE(review): http
+        model_ids = [
+            "tts_transformer-en-ljspeech",
+            "tts_transformer-en-200_speaker-cv4",
+            "tts_transformer-es-css10",
+            "tts_transformer-fr-cv7_css10",
+            "tts_transformer-ru-cv7_css10",
+            "tts_transformer-zh-cv7_css10",
+            "tts_transformer-ar-cv7_css10",
+            "tts_transformer-tr-cv7_css10",
+            "tts_transformer-vi-cv7",
+        ]
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}  # id -> archive URL
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        config_yaml="config.yaml",
+        vocoder: str = "griffin_lim",
+        fp16: bool = False,
+        **kwargs,
+    ):
+        from fairseq import hub_utils  # function-scope import, only needed here
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            config_yaml=config_yaml,
+            vocoder=vocoder,
+            fp16=fp16,
+            **kwargs,
+        )
+        return TTSHubInterface(x["args"], x["task"], x["models"][0])  # first model only
+
+    @staticmethod
+    def add_args(parser):
+        parser.add_argument("--dropout", type=float)
+        parser.add_argument("--output-frame-dim", type=int)
+        parser.add_argument("--speaker-embed-dim", type=int)
+        # encoder prenet
+        parser.add_argument("--encoder-dropout", type=float)
+        parser.add_argument("--encoder-conv-layers", type=int)
+        parser.add_argument("--encoder-conv-kernel-size", type=int)
+        # encoder transformer layers
+        parser.add_argument("--encoder-transformer-layers", type=int)
+        parser.add_argument("--encoder-embed-dim", type=int)
+        parser.add_argument("--encoder-ffn-embed-dim", type=int)
+        parser.add_argument("--encoder-normalize-before", action="store_true")
+        parser.add_argument("--encoder-attention-heads", type=int)
+        parser.add_argument("--attention-dropout", type=float)
+        parser.add_argument("--activation-dropout", "--relu-dropout", type=float)
+        parser.add_argument("--activation-fn", type=str, default="relu")
+        # decoder prenet
+        parser.add_argument("--prenet-dropout", type=float)
+        parser.add_argument("--prenet-layers", type=int)
+        parser.add_argument("--prenet-dim", type=int)
+        # decoder postnet
+        parser.add_argument("--postnet-dropout", type=float)
+        parser.add_argument("--postnet-layers", type=int)
+        parser.add_argument("--postnet-conv-dim", type=int)
+        parser.add_argument("--postnet-conv-kernel-size", type=int)
+        # decoder transformer layers
+        parser.add_argument("--decoder-transformer-layers", type=int)
+        parser.add_argument("--decoder-embed-dim", type=int)
+        parser.add_argument("--decoder-ffn-embed-dim", type=int)
+        parser.add_argument("--decoder-normalize-before", action="store_true")
+        parser.add_argument("--decoder-attention-heads", type=int)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._num_updates = 0  # kept in sync by set_num_updates()
+
+    @classmethod
+    def build_model(cls, args, task):
+        embed_speaker = task.get_speaker_embeddings(args)
+        encoder = TTSTransformerEncoder(args, task.src_dict, embed_speaker)
+        decoder = TTSTransformerDecoder(args, task.src_dict)  # also gets src_dict
+        return cls(encoder, decoder)
+
+    def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs):
+        return self.encoder(
+            src_tokens, src_lengths=src_lengths, speaker=speaker, **kwargs
+        )
+
+    def set_num_updates(self, num_updates):
+        super().set_num_updates(num_updates)
+        self._num_updates = num_updates
+
+
+@register_model_architecture("tts_transformer", "tts_transformer")
+def base_architecture(args):  # fills in defaults for attributes missing on args
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.output_frame_dim = getattr(args, "output_frame_dim", 80)
+    args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 64)
+    # encoder prenet
+    args.encoder_dropout = getattr(args, "encoder_dropout", 0.5)
+    args.encoder_conv_layers = getattr(args, "encoder_conv_layers", 3)
+    args.encoder_conv_kernel_size = getattr(args, "encoder_conv_kernel_size", 5)
+    # encoder transformer layers
+    args.encoder_transformer_layers = getattr(args, "encoder_transformer_layers", 6)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(  # default: 4x the encoder embed dim
+        args, "encoder_ffn_embed_dim", 4 * args.encoder_embed_dim
+    )
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    # decoder prenet
+    args.prenet_dropout = getattr(args, "prenet_dropout", 0.5)
+    args.prenet_layers = getattr(args, "prenet_layers", 2)
+    args.prenet_dim = getattr(args, "prenet_dim", 256)
+    # decoder postnet
+    args.postnet_dropout = getattr(args, "postnet_dropout", 0.5)
+    args.postnet_layers = getattr(args, "postnet_layers", 5)
+    args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512)
+    args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5)
+    # decoder transformer layers
+    args.decoder_transformer_layers = getattr(args, "decoder_transformer_layers", 6)
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = getattr(  # default: 4x the decoder embed dim
+        args, "decoder_ffn_embed_dim", 4 * args.decoder_embed_dim
+    )
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
new file mode 100644
index 0000000000..dbc02da368
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
@@ -0,0 +1,305 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+from typing import Dict
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from fairseq.data.audio.audio_utils import (
+    TTSSpectrogram,
+    get_fourier_basis,
+    get_mel_filters,
+    get_window,
+)
+from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig
+from fairseq.models import BaseFairseqModel, register_model
+from fairseq.models.text_to_speech.codehifigan import CodeGenerator as CodeHiFiGANModel
+from fairseq.models.text_to_speech.hifigan import Generator as HiFiGANModel
+from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
+
+logger = logging.getLogger(__name__)
+
+
+class PseudoInverseMelScale(torch.nn.Module):
+    def __init__(self, n_stft, n_mels, sample_rate, f_min, f_max) -> None:
+        super().__init__()
+        self.n_mels = n_mels
+        # NOTE(review): (n_stft - 1) * 2 presumably reconstructs n_fft -- confirm
+        mel_fb = get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max)
+        self.register_buffer("basis", torch.pinverse(mel_fb))  # F x F_mel
+
+    def forward(self, melspec: torch.Tensor) -> torch.Tensor:
+        # melspec: B_1 x ... x B_K x F_mel x T; leading dims are folded into one
+        lead_dims = melspec.shape[:-2]
+        n_mels, n_frames = melspec.shape[-2], melspec.shape[-1]
+        packed = melspec.view(-1, n_mels, n_frames)
+
+        assert self.n_mels == n_mels, (self.n_mels, n_mels)
+        n_freq = self.basis.size(0)  # basis is F x F_mel
+        # pseudo-inverse projection; negative values are clamped to zero
+        linear = self.basis.matmul(packed).clamp(min=0)
+
+        # restore the leading batch dims: B_1 x ... x B_K x F x T
+        return linear.view(lead_dims + (n_freq, n_frames))
+
+
+class GriffinLim(torch.nn.Module):
+    def __init__(
+        self,
+        n_fft: int,
+        win_length: int,
+        hop_length: int,
+        n_iter: int,
+        window_fn=torch.hann_window,
+    ):
+        super(GriffinLim, self).__init__()
+        self.transform = TTSSpectrogram(
+            n_fft, win_length, hop_length, return_phase=True
+        )
+
+        basis = get_fourier_basis(n_fft)
+        basis = torch.pinverse(n_fft / hop_length * basis).T[:, None, :]
+        basis *= get_window(window_fn, n_fft, win_length)
+        self.register_buffer("basis", basis)
+        # NOTE(review): window_fn is not stored; inverse() uses the default window
+        self.n_fft = n_fft
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.n_iter = n_iter
+
+        self.tiny = 1.1754944e-38  # smallest positive normal float32
+
+    @classmethod
+    def get_window_sum_square(
+        cls, n_frames, hop_length, win_length, n_fft, window_fn=torch.hann_window
+    ) -> torch.Tensor:
+        w_sq = get_window(window_fn, n_fft, win_length) ** 2
+        n = n_fft + hop_length * (n_frames - 1)
+        x = torch.zeros(n, dtype=torch.float32)
+        for i in range(n_frames):  # accumulate w_sq at every frame offset
+            ofst = i * hop_length
+            x[ofst : min(n, ofst + n_fft)] += w_sq[: max(0, min(n_fft, n - ofst))]
+        return x
+
+    def inverse(self, magnitude: torch.Tensor, phase) -> torch.Tensor:
+        x = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+        x = F.conv_transpose1d(x, self.basis, stride=self.hop_length)
+        win_sum_sq = self.get_window_sum_square(
+            magnitude.shape[-1],
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            n_fft=self.n_fft,
+        ).to(magnitude.device)
+        # remove modulation effects: divide by the summed squared window where > tiny
+        approx_nonzero_indices = win_sum_sq > self.tiny
+        x[:, :, approx_nonzero_indices] /= win_sum_sq[approx_nonzero_indices]
+        x *= self.n_fft / self.hop_length
+        x = x[:, :, self.n_fft // 2 :]  # drop n_fft // 2 samples at the start
+        x = x[:, :, : -self.n_fft // 2 :]  # stop index is (-n_fft) // 2
+        return x
+
+    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
+        angles = np.angle(np.exp(2j * np.pi * np.random.rand(*specgram.shape)))
+        angles = torch.from_numpy(angles).to(specgram)
+        _specgram = specgram.view(-1, specgram.shape[-2], specgram.shape[-1])
+        waveform = self.inverse(_specgram, angles).squeeze(1)
+        for _ in range(self.n_iter):  # re-estimate phase, keep the given magnitudes
+            _, angles = self.transform(waveform)
+            waveform = self.inverse(_specgram, angles).squeeze(1)
+        return waveform.squeeze(0)
+
+
+class GriffinLimVocoder(nn.Module):
+    """Features -> waveform via PseudoInverseMelScale followed by GriffinLim."""
+
+    def __init__(
+        self,
+        sample_rate,
+        win_size,
+        hop_size,
+        n_fft,
+        n_mels,
+        f_min,
+        f_max,
+        window_fn,
+        spec_bwd_max_iter=32,
+        fp16=False,
+    ):
+        super().__init__()
+        self.inv_mel_transform = PseudoInverseMelScale(
+            n_stft=n_fft // 2 + 1,
+            n_mels=n_mels,
+            sample_rate=sample_rate,
+            f_min=f_min,
+            f_max=f_max,
+        )
+        self.gl_transform = GriffinLim(
+            n_fft=n_fft,
+            win_length=win_size,
+            hop_length=hop_size,
+            window_fn=window_fn,
+            n_iter=spec_bwd_max_iter,
+        )
+        # half()/float() on the parent already recurse into both sub-modules
+        if fp16:
+            self.half()
+        else:
+            self.float()
+
+    def forward(self, x):
+        # x: (B x) T x D -> (B x) 1 x T
+        # NOTE: batched forward produces noisier waveform. recommend running
+        # one utterance at a time
+        self.eval()
+        # exponentiate the features and move the feature axis before time
+        linear_spec = self.inv_mel_transform(x.exp().transpose(-1, -2))
+        return self.gl_transform(linear_spec)
+
+    @classmethod
+    def from_data_cfg(cls, args, data_cfg: S2TDataConfig):
+        """Build the vocoder from the ``features`` section of the data config."""
+        feat_cfg = data_cfg.config["features"]
+        window_fn = getattr(torch, feat_cfg["window_fn"] + "_window")
+        sample_rate = feat_cfg["sample_rate"]
+        return cls(
+            sample_rate=sample_rate,
+            win_size=int(feat_cfg["win_len_t"] * sample_rate),
+            hop_size=int(feat_cfg["hop_len_t"] * sample_rate),
+            n_fft=feat_cfg["n_fft"],
+            n_mels=feat_cfg["n_mels"],
+            f_min=feat_cfg["f_min"],
+            f_max=feat_cfg["f_max"],
+            window_fn=window_fn,
+            spec_bwd_max_iter=args.spec_bwd_max_iter,
+            fp16=args.fp16,
+        )
+
+
+class HiFiGANVocoder(nn.Module):  # inference wrapper around a HiFiGAN generator
+    def __init__(
+        self, checkpoint_path: str, model_cfg: Dict[str, str], fp16: bool = False
+    ) -> None:
+        super().__init__()
+        self.model = HiFiGANModel(model_cfg)
+        # map to CPU so checkpoints saved on a GPU also load on CPU-only hosts
+        state_dict = torch.load(checkpoint_path, map_location="cpu")
+        self.model.load_state_dict(state_dict["generator"])  # copied into self.model
+        if fp16:
+            self.model.half()
+        logger.info(f"loaded HiFiGAN checkpoint from {checkpoint_path}")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # (B x) T x D -> (B x) 1 x T
+        model = self.model.eval()
+        if len(x.shape) == 2:  # unbatched: add a batch dim, strip it afterwards
+            return model(x.unsqueeze(0).transpose(1, 2)).detach().squeeze(0)
+        return model(x.transpose(-1, -2)).detach()
+
+    @classmethod
+    def from_data_cfg(cls, args, data_cfg: S2TDataConfig):
+        vocoder_cfg = data_cfg.vocoder
+        assert vocoder_cfg.get("type", "griffin_lim") == "hifigan"
+        with open(vocoder_cfg["config"]) as f:  # JSON file with the generator config
+            model_cfg = json.load(f)
+        return cls(vocoder_cfg["checkpoint"], model_cfg, fp16=args.fp16)
+
+
+@register_model("CodeHiFiGANVocoder")
+class CodeHiFiGANVocoder(BaseFairseqModel):
+    def __init__(
+        self, checkpoint_path: str, model_cfg: Dict[str, str], fp16: bool = False
+    ) -> None:
+        super().__init__()
+        self.model = CodeHiFiGANModel(model_cfg)
+        if torch.cuda.is_available():
+            state_dict = torch.load(checkpoint_path)
+        else:
+            state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        self.model.load_state_dict(state_dict["generator"])
+        self.model.eval()
+        if fp16:
+            self.model.half()
+        self.model.remove_weight_norm()
+        logger.info(f"loaded CodeHiFiGAN checkpoint from {checkpoint_path}")
+
+    def forward(self, x: Dict[str, torch.Tensor], dur_prediction=False) -> torch.Tensor:
+        assert "code" in x
+        x["dur_prediction"] = dur_prediction  # NOTE: x (the caller's dict) is mutated
+
+        # remove invalid (negative) codes and the f0 frames that belong to them
+        mask = x["code"] >= 0
+        x["code"] = x["code"][mask].unsqueeze(dim=0)
+        if "f0" in x:
+            f0_up_ratio = x["f0"].size(1) // mask.size(1)  # vs. unfiltered code length
+            mask = mask.unsqueeze(2).repeat(1, 1, f0_up_ratio).view(-1, x["f0"].size(1))
+            x["f0"] = x["f0"][mask].unsqueeze(dim=0)
+
+        return self.model(**x).detach().squeeze()
+
+    @classmethod
+    def from_data_cfg(cls, args, data_cfg):
+        vocoder_cfg = data_cfg.vocoder
+        assert vocoder_cfg is not None, "vocoder not specified in the data config"
+        with open(vocoder_cfg["config"]) as f:  # JSON file with the generator config
+            model_cfg = json.load(f)
+        return cls(vocoder_cfg["checkpoint"], model_cfg, fp16=args.fp16)
+
+    @classmethod
+    def hub_models(cls):
+        base_url = "http://dl.fbaipublicfiles.com/fairseq/vocoder"
+        model_ids = [
+            "unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur",
+            "unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10_dur",
+            "unit_hifigan_HK_layer12.km2500_frame_TAT-TTS",
+        ]
+        return {i: f"{base_url}/{i}.tar.gz" for i in model_ids}  # id -> archive URL
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        config="config.json",
+        fp16: bool = False,
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            config_yaml=config,
+            fp16=fp16,
+            is_vocoder=True,
+            **kwargs,
+        )
+
+        with open(f"{x['args']['data']}/{config}") as f:
+            vocoder_cfg = json.load(f)
+        assert len(x["args"]["model_path"]) == 1, "Too many vocoder models in the input"
+        # NOTE(review): fp16 is not forwarded to the vocoder constructor below
+        vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
+        return VocoderHubInterface(vocoder_cfg, vocoder)
+
+
+def get_vocoder(args, data_cfg: S2TDataConfig):  # factory keyed on args.vocoder
+    vocoder_cls = {
+        "griffin_lim": GriffinLimVocoder,
+        "hifigan": HiFiGANVocoder,
+        "code_hifigan": CodeHiFiGANVocoder,
+    }.get(args.vocoder)
+    if vocoder_cls is None:
+        raise ValueError("Unknown vocoder")
+    return vocoder_cls.from_data_cfg(args, data_cfg)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
new file mode 100644
index 0000000000..681fca3d45
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+from .transformer_config import (
+    TransformerConfig,
+    DEFAULT_MAX_SOURCE_POSITIONS,
+    DEFAULT_MAX_TARGET_POSITIONS,
+    DEFAULT_MIN_PARAMS_TO_WRAP,
+)
+from .transformer_decoder import TransformerDecoder, TransformerDecoderBase, Linear
+from .transformer_encoder import TransformerEncoder, TransformerEncoderBase
+from .transformer_legacy import (
+    TransformerModel,
+    base_architecture,
+    tiny_architecture,
+    transformer_iwslt_de_en,
+    transformer_wmt_en_de,
+    transformer_vaswani_wmt_en_de_big,
+    transformer_vaswani_wmt_en_fr_big,
+    transformer_wmt_en_de_big,
+    transformer_wmt_en_de_big_t2t,
+)
+from .transformer_base import TransformerModelBase, Embedding
+
+
+__all__ = [
+    "TransformerModelBase",
+    "TransformerConfig",
+    "TransformerDecoder",
+    "TransformerDecoderBase",
+    "TransformerEncoder",
+    "TransformerEncoderBase",
+    "TransformerModel",
+    "Embedding",
+    "Linear",
+    "base_architecture",
+    "tiny_architecture",
+    "transformer_iwslt_de_en",
+    "transformer_wmt_en_de",
+    "transformer_vaswani_wmt_en_de_big",
+    "transformer_vaswani_wmt_en_fr_big",
+    "transformer_wmt_en_de_big",
+    "transformer_wmt_en_de_big_t2t",
+    "DEFAULT_MAX_SOURCE_POSITIONS",
+    "DEFAULT_MAX_TARGET_POSITIONS",
+    "DEFAULT_MIN_PARAMS_TO_WRAP",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
new file mode 100644
index 0000000000..f9f097f04b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
@@ -0,0 +1,193 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+import logging
+
+from fairseq import utils
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.distributed import fsdp_wrap
+from fairseq.models import FairseqEncoderDecoderModel
+from fairseq.models.transformer import (
+    TransformerConfig,
+    TransformerDecoderBase,
+    TransformerEncoderBase,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class TransformerModelBase(FairseqEncoderDecoderModel):
+    """
+    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
+    <https://arxiv.org/abs/1706.03762>`_.
+
+    Args:
+        encoder (TransformerEncoder): the encoder
+        decoder (TransformerDecoder): the decoder
+
+    The Transformer model provides the following named architectures and
+    command-line arguments:
+
+    .. argparse::
+        :ref: fairseq.models.transformer_parser
+        :prog:
+    """
+
+    def __init__(self, cfg, encoder, decoder):  # cfg: the model configuration
+        super().__init__(encoder, decoder)
+        self.cfg = cfg  # stored as-is (not copied)
+        self.supports_align_args = True  # forward() takes alignment_layer / alignment_heads
+
+    @classmethod
+    def add_args(cls, parser):
+        """Register the TransformerConfig fields as arguments on ``parser``."""
+        # The sub-configs are meant to be expanded recursively in this case.
+        default_cfg = TransformerConfig()
+        options = dict(delete_default=False, with_prefix="")
+        gen_parser_from_dataclass(parser, default_cfg, **options)
+
+    @classmethod
+    def build_model(cls, cfg, task):
+        """Build a new model instance from ``cfg`` and ``task`` (``cfg`` is modified in place)."""
+
+        # --  TODO T96535332
+        #  bug caused by interaction between OmegaConf II and argparsing
+        cfg.decoder.input_dim = int(cfg.decoder.input_dim)  # coerce to int (work-around)
+        cfg.decoder.output_dim = int(cfg.decoder.output_dim)
+        # --
+
+        if cfg.encoder.layers_to_keep:  # treated as a comma-separated string here
+            cfg.encoder.layers = len(cfg.encoder.layers_to_keep.split(","))
+        if cfg.decoder.layers_to_keep:
+            cfg.decoder.layers = len(cfg.decoder.layers_to_keep.split(","))
+
+        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
+
+        if cfg.share_all_embeddings:  # one table for encoder input, decoder input and output
+            if src_dict != tgt_dict:
+                raise ValueError("--share-all-embeddings requires a joined dictionary")
+            if cfg.encoder.embed_dim != cfg.decoder.embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if cfg.decoder.embed_path and (
+                cfg.decoder.embed_path != cfg.encoder.embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+            encoder_embed_tokens = cls.build_embedding(
+                cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path
+            )
+            decoder_embed_tokens = encoder_embed_tokens  # same module object, not a copy
+            cfg.share_decoder_input_output_embed = True
+        elif cfg.merge_src_tgt_embed:  # merge both vocabularies, then share one table
+            logger.info(f"source dict size: {len(src_dict)}")
+            logger.info(f"target dict size: {len(tgt_dict)}")
+            src_dict.update(tgt_dict)  # in-place update of the dictionary obtained from the task
+            task.src_dict = src_dict
+            task.tgt_dict = src_dict
+            logger.info(f"merged dict size: {len(src_dict)}")
+            encoder_embed_tokens = cls.build_embedding(
+                cfg, src_dict, cfg.encoder.embed_dim  # NOTE: no embed_path is passed in this branch
+            )
+            decoder_embed_tokens = encoder_embed_tokens
+            cfg.share_decoder_input_output_embed = True
+        else:  # independent encoder and decoder embedding tables
+            encoder_embed_tokens = cls.build_embedding(
+                cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path
+            )
+            decoder_embed_tokens = cls.build_embedding(
+                cfg, tgt_dict, cfg.decoder.embed_dim, cfg.decoder.embed_path
+            )
+        if cfg.offload_activations:
+            cfg.checkpoint_activations = True  # offloading implies checkpointing
+        encoder = cls.build_encoder(cfg, src_dict, encoder_embed_tokens)
+        decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens)
+        return cls(cfg, encoder, decoder)
+
+    @classmethod
+    def build_embedding(cls, cfg, dictionary, embed_dim, path=None):
+        """Create the embedding table for ``dictionary``; load weights from ``path`` if given."""
+        vocab_size, pad_index = len(dictionary), dictionary.pad()
+        embed_tokens = Embedding(vocab_size, embed_dim, pad_index)
+        if not path:
+            return embed_tokens
+        # A path was supplied: parse that file and load it into the table.
+        pretrained = utils.parse_embedding(path)
+        utils.load_embedding(pretrained, dictionary, embed_tokens)
+        return embed_tokens
+
+    @classmethod
+    def build_encoder(cls, cfg, src_dict, embed_tokens):  # called from build_model
+        return TransformerEncoderBase(cfg, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, cfg, tgt_dict, embed_tokens):
+        """Instantiate the decoder; ``cfg.no_cross_attention`` is passed as ``no_encoder_attn``."""
+        skip_cross_attention = cfg.no_cross_attention
+        decoder = TransformerDecoderBase(
+            cfg, tgt_dict, embed_tokens, no_encoder_attn=skip_cross_attention
+        )
+        return decoder
+
+    # TorchScript doesn't support optional arguments with variable length (**kwargs).
+    # Current workaround is to add union of all arguments in child classes.
+    def forward(
+        self,
+        src_tokens,
+        src_lengths,
+        prev_output_tokens,
+        return_all_hiddens: bool = True,
+        features_only: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Run the encoder on the source tokens, then the decoder on its output.
+
+        Every argument is spelled out explicitly instead of using
+        ``**kwargs``, which are not supported by TorchScript. Whatever the
+        decoder returns is returned unchanged.
+        """
+        encoded = self.encoder(
+            src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens
+        )
+        return self.decoder(
+            prev_output_tokens,
+            encoder_out=encoded,
+            features_only=features_only,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+            src_lengths=src_lengths,
+            return_all_hiddens=return_all_hiddens,
+        )
+
+    # The Fairseq model's get_normalized_probs is not scriptable, so this
+    # override only forwards to the scriptable helper of the base class.
+    @torch.jit.export
+    def get_normalized_probs(
+        self,
+        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+        log_probs: bool,
+        sample: Optional[Dict[str, Tensor]] = None,
+    ):
+        """Return normalized probabilities (or log probs) for a net's output."""
+        normalized = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+        return normalized
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    table = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(table.weight, mean=0, std=embedding_dim**-0.5)  # variance 1/dim
+    nn.init.constant_(table.weight[padding_idx], 0)  # re-zero the padding row
+    return table
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
new file mode 100644
index 0000000000..4650de2e17
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
@@ -0,0 +1,341 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import re
+from dataclasses import dataclass, field, fields
+from typing import List, Optional
+
+from omegaconf import II
+
+from fairseq import utils
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.utils import safe_getattr, safe_hasattr
+
+DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
+
+_NAME_PARSER = r"(decoder|encoder|quant_noise)_(.*)"
+
+
+@dataclass
+class EncDecBaseConfig(FairseqDataclass):  # options shared by the encoder and decoder sub-configs
+    embed_path: Optional[str] = field(
+        default=None, metadata={"help": "path to pre-trained embedding"}
+    )
+    embed_dim: Optional[int] = field(
+        default=512, metadata={"help": "embedding dimension"}
+    )
+    ffn_embed_dim: int = field(
+        default=2048, metadata={"help": "embedding dimension for FFN"}
+    )
+    layers: int = field(default=6, metadata={"help": "number of layers"})
+    attention_heads: int = field(
+        default=8, metadata={"help": "number of attention heads"}
+    )
+    normalize_before: bool = field(
+        default=False, metadata={"help": "apply layernorm before each block"}
+    )
+    learned_pos: bool = field(
+        default=False, metadata={"help": "use learned positional embeddings"}
+    )
+    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+    layerdrop: float = field(default=0, metadata={"help": "LayerDrop probability"})
+    layers_to_keep: Optional[List[int]] = field(  # NOTE(review): build_model calls .split(",") on this value
+        default=None, metadata={"help": "which layers to *keep* when pruning"}
+    )
+
+    xformers_att_config: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "config for xFormers attention, defined in xformers.components.attention.AttentionConfig"
+        },
+    )
+
+
+@dataclass
+class DecoderConfig(EncDecBaseConfig):
+    input_dim: int = II("model.decoder.embed_dim")
+    output_dim: int = field(
+        default=II("model.decoder.embed_dim"),
+        metadata={
+            "help": "decoder output dimension (extra linear layer if different from decoder embed dim)"
+        },
+    )
+
+    def __post_init__(self):
+        # II(...) doesn't work outside of hydra, so fall back to embed_dim there.
+        unresolved = II("model.decoder.embed_dim")
+        for dim_name in ("input_dim", "output_dim"):
+            if getattr(self, dim_name) == unresolved:
+                setattr(self, dim_name, self.embed_dim)
+
+
+@dataclass
+class QuantNoiseConfig(FairseqDataclass):  # reachable as TransformerConfig.quant_noise
+    pq: float = field(  # TransformerDecoderBase only builds its quant-noise layer when pq > 0
+        default=0.0,
+        metadata={"help": "iterative PQ quantization noise at training time"},
+    )
+    pq_block_size: int = field(  # passed to quant_noise() together with pq
+        default=8,
+        metadata={"help": "block size of quantization noise at training time"},
+    )
+    scalar: float = field(
+        default=0.0,
+        metadata={
+            "help": "scalar quantization noise and scalar quantization at training time"
+        },
+    )
+
+
+@dataclass
+class TransformerConfig(FairseqDataclass):
+    """Transformer config; sub-configs use default_factory so instances never share them."""
+    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
+        default="relu", metadata={"help": "activation function to use"}
+    )
+    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
+    attention_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability for attention weights"}
+    )
+    activation_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability after activation in FFN.",
+            "alias": "--relu-dropout",
+        },
+    )
+    adaptive_input: bool = False
+    encoder: EncDecBaseConfig = field(default_factory=EncDecBaseConfig)  # fresh per instance
+    # TODO should really be in the encoder config
+    max_source_positions: int = field(
+        default=DEFAULT_MAX_SOURCE_POSITIONS,
+        metadata={"help": "Maximum input length supported by the encoder"},
+    )
+    decoder: DecoderConfig = field(default_factory=DecoderConfig)  # fresh per instance
+    # TODO should really be in the decoder config
+    max_target_positions: int = field(
+        default=DEFAULT_MAX_TARGET_POSITIONS,
+        metadata={"help": "Maximum output length supported by the decoder"},
+    )
+    share_decoder_input_output_embed: bool = field(
+        default=False, metadata={"help": "share decoder input and output embeddings"}
+    )
+    share_all_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)"
+        },
+    )
+    merge_src_tgt_embed: bool = field(
+        default=False,
+        metadata={
+            "help": "if true then the source and target embedding table is "
+            "merged into one table. This is going to make the model smaller but "
+            "it might hurt performance."
+        },
+    )
+    no_token_positional_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if True, disables positional embeddings (outside self attention)"
+        },
+    )
+    adaptive_softmax_cutoff: Optional[List[int]] = field(
+        default=None,
+        metadata={
+            "help": "list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion"
+        },
+    )
+    adaptive_softmax_dropout: float = field(
+        default=0.0,
+        metadata={"help": "sets adaptive softmax dropout for the tail projections"},
+    )
+    adaptive_softmax_factor: float = field(
+        default=4, metadata={"help": "adaptive input factor"}
+    )
+    layernorm_embedding: bool = field(
+        default=False, metadata={"help": "add layernorm to embedding"}
+    )
+    tie_adaptive_weights: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, ties the weights of adaptive softmax and adaptive input"
+        },
+    )
+    tie_adaptive_proj: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, ties the projection weights of adaptive softmax and adaptive input"
+        },
+    )
+    no_scale_embedding: bool = field(
+        default=False, metadata={"help": "if True, dont scale embeddings"}
+    )
+    checkpoint_activations: bool = field(
+        default=False,
+        metadata={
+            "help": "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute"
+        },
+    )
+    offload_activations: bool = field(  # forwarded as offload_to_cpu to checkpoint_wrapper
+        default=False,
+        metadata={
+            "help": "checkpoint activations at each layer, then offload them to CPU. Sets --checkpoint-activations."
+        },
+    )
+    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
+    no_cross_attention: bool = field(
+        default=False, metadata={"help": "do not perform cross-attention"}
+    )
+    cross_self_attention: bool = field(
+        default=False, metadata={"help": "perform cross+self-attention"}
+    )
+    # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+    quant_noise: QuantNoiseConfig = field(default_factory=QuantNoiseConfig)
+    min_params_to_wrap: int = field(
+        default=DEFAULT_MIN_PARAMS_TO_WRAP,
+        metadata={
+            "help": "minimum number of params for a layer to be wrapped with FSDP() when "
+            "training with --ddp-backend=fully_sharded. Smaller values will "
+            "improve memory efficiency, but may make torch.distributed "
+            "communication less efficient due to smaller input sizes. This option "
+            "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
+            "--offload-activations are passed."
+        },
+    )
+    # DEPRECATED field, but some old checkpoints might have it
+    char_inputs: bool = field(
+        default=False, metadata={"help": "if set, model takes character ids as input"}
+    )
+    relu_dropout: float = 0.0
+    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
+    base_layers: Optional[int] = field(
+        default=0, metadata={"help": "number of BASE layers in total"}
+    )
+    base_sublayers: Optional[int] = field(
+        default=1, metadata={"help": "number of sublayers in each BASE layer"}
+    )
+    base_shuffle: Optional[int] = field(
+        default=1,
+        metadata={"help": "shuffle tokens between workers before computing assignment"},
+    )
+
+    export: bool = field(
+        default=False,
+        metadata={"help": "make the layernorm exportable with torchscript."},
+    )
+
+    # copied from transformer_lm but expected in transformer_decoder:
+    no_decoder_final_norm: bool = field(
+        default=False,
+        metadata={"help": "don't add an extra layernorm after the last decoder block"},
+    )
+
+    # We need to make this hierarchical dataclass like the flat namespace
+    # __getattr__ and __setattr__ here allow backward compatibility
+    # for subclasses of Transformer(Legacy) that depend on read/write on
+    # the flat namespace.
+
+    def __getattr__(self, name):
+        parsed = re.match(_NAME_PARSER, name)  # e.g. "decoder_embed_dim"
+        if parsed is None:
+            raise AttributeError(f"invalid argument {name}.")
+        section, key = parsed.groups()  # -> ("decoder", "embed_dim")
+        return safe_getattr(safe_getattr(self, section), key)
+
+    def __setattr__(self, name, value):
+        parsed = re.match(_NAME_PARSER, name)
+        if parsed is None:
+            # Not a prefixed flat name: store it on this object as usual.
+            return super().__setattr__(name, value)
+        section, key = parsed.groups()
+        setattr(safe_getattr(self, section), key, value)
+
+    @staticmethod
+    def _copy_keys(args, cls, prefix, seen):
+        """
+        Build a ``cls`` instance from a flat namespace, e.g. ``decoder_embed_dim``
+        on ``args`` becomes ``embed_dim`` on the returned sub-config.
+
+        Every name that was read from ``args`` is added to ``seen``.
+        """
+        sub_cfg = cls()
+        for fld in fields(cls):
+            # Each field is looked up under its prefixed name first and under
+            # its bare name second; when both are present the bare name is
+            # applied last and therefore wins.
+            for source_key in (f"{prefix}_{fld.name}", fld.name):
+                if safe_hasattr(args, source_key):
+                    seen.add(source_key)
+                    setattr(sub_cfg, fld.name, safe_getattr(args, source_key))
+        return sub_cfg
+
+    @classmethod
+    def from_namespace(cls, args):
+        """
+        Convert a flat (argparse-style) namespace into a hierarchical config.
+
+        ``None`` is returned unchanged and an object that is already an
+        instance of ``cls`` is returned as-is. Otherwise a fresh config is
+        populated in three steps:
+
+        1. ``decoder`` / ``encoder`` / ``quant_noise``: when ``args`` already
+           carries the structured value (e.g. a DictConfig) the sub-dataclass
+           is built from it directly; otherwise it is collected from the
+           flat keys via ``_copy_keys``.
+        2. every other declared field found on ``args`` is copied over.
+        3. any remaining attribute of ``args`` is copied too, because legacy
+           models do not declare all of their arguments in the dataclass and
+           some places expect arguments that are never defined.
+        """
+        if args is None:
+            return None
+        if isinstance(args, cls):
+            return args
+
+        # We can go generically from DC fields to args hierarchically, but a
+        # flat namespace cannot easily be deconstructed into a hierarchical DC
+        # (a sub-dc called `decoder-foo` must not end up in `decoder`), so the
+        # sub-dataclasses this config knows about are listed explicitly.
+        sub_dataclasses = {
+            "decoder": DecoderConfig,
+            "encoder": EncDecBaseConfig,
+            "quant_noise": QuantNoiseConfig,
+        }
+        seen = set()
+        config = cls()
+        for fld in fields(cls):
+            sub_cls = sub_dataclasses.get(fld.name)
+            if sub_cls is not None:
+                if safe_hasattr(args, fld.name):
+                    # args is already structured here: build the DC from it
+                    seen.add(fld.name)
+                    sub_cfg = sub_cls(**getattr(args, fld.name))
+                else:
+                    sub_cfg = cls._copy_keys(args, sub_cls, fld.name, seen)
+                setattr(config, fld.name, sub_cfg)
+            elif safe_hasattr(args, fld.name):
+                # not a structure field, just a normal field: copy it over
+                seen.add(fld.name)
+                setattr(config, fld.name, safe_getattr(args, fld.name))
+
+        # All declared fields are handled at this point; what is left on
+        # ``args`` are the undeclared extras described in step 3 above.
+        # A namedtuple has no __dict__, so _asdict() is tried first.
+        if safe_hasattr(args, "_asdict"):
+            extras = args._asdict()
+        elif safe_hasattr(args, "__dict__"):
+            extras = vars(args)
+        else:
+            extras = {}
+        for key, value in extras.items():
+            if key not in seen:
+                setattr(config, key, value)
+        return config
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
new file mode 100644
index 0000000000..c22e5625d4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
@@ -0,0 +1,482 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import utils
+from fairseq.distributed import fsdp_wrap
+from fairseq.models import FairseqIncrementalDecoder
+from fairseq.models.transformer import TransformerConfig
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    BaseLayer,
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    PositionalEmbedding,
+    SinusoidalPositionalEmbedding,
+    transformer_layer,
+)
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+
+# rewrite name for backward compatibility in `make_generation_fast_`
+def module_name_fordropout(module_name: str) -> str:
+    """Map "TransformerDecoderBase" to "TransformerDecoder"; other names pass through."""
+    if module_name == "TransformerDecoderBase":
+        return "TransformerDecoder"
+    return module_name
+
+
+class TransformerDecoderBase(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *cfg.decoder.layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        cfg (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self,
+        cfg,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+        output_projection=None,  # when given, build_output_projection() is skipped
+    ):
+        self.cfg = cfg
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))
+        self._future_mask = torch.empty(0)  # starts empty; not filled in this method
+
+        self.dropout_module = FairseqDropout(
+            cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
+        )
+        self.decoder_layerdrop = cfg.decoder.layerdrop
+        self.share_input_output_embed = cfg.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim  # width of the embedding table
+        embed_dim = cfg.decoder.embed_dim  # width used by the decoder layers
+        self.embed_dim = embed_dim
+        self.output_embed_dim = cfg.decoder.output_dim
+
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = cfg.max_target_positions
+
+        self.embed_tokens = embed_tokens
+
+        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)
+
+        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(embed_dim, embed_dim, bias=False),
+                cfg.quant_noise.pq,
+                cfg.quant_noise.pq_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        self.project_in_dim = (  # only created when the two widths above differ
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+        self.embed_positions = (
+            PositionalEmbedding(
+                self.max_target_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=cfg.decoder.learned_pos,
+            )
+            if not cfg.no_token_positional_embeddings
+            else None
+        )
+        if cfg.layernorm_embedding:
+            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
+        else:
+            self.layernorm_embedding = None
+
+        self.cross_self_attention = cfg.cross_self_attention
+
+        if self.decoder_layerdrop > 0.0:  # LayerDrop container only when the probability is positive
+            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_decoder_layer(cfg, no_encoder_attn)
+                for _ in range(cfg.decoder.layers)
+            ]
+        )
+        self.num_layers = len(self.layers)  # NOTE: taken before build_output_projection() may insert BaseLayers
+
+        if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm:
+            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
+        else:
+            self.layer_norm = None
+
+        self.project_out_dim = (  # only when decoder width != output width and weights are not tied
+            Linear(embed_dim, self.output_embed_dim, bias=False)
+            if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights
+            else None
+        )
+
+        self.adaptive_softmax = None
+        self.output_projection = output_projection
+        if self.output_projection is None:
+            self.build_output_projection(cfg, dictionary, embed_tokens)
+
+    def build_output_projection(self, cfg, dictionary, embed_tokens):  # called by __init__ when no projection was passed
+        if cfg.adaptive_softmax_cutoff is not None:  # adaptive softmax; output_projection is left untouched
+            self.adaptive_softmax = AdaptiveSoftmax(
+                len(dictionary),
+                self.output_embed_dim,
+                utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int),
+                dropout=cfg.adaptive_softmax_dropout,
+                adaptive_inputs=embed_tokens if cfg.tie_adaptive_weights else None,
+                factor=cfg.adaptive_softmax_factor,
+                tie_proj=cfg.tie_adaptive_proj,
+            )
+        elif self.share_input_output_embed:  # reuse the input embedding matrix as the output weight
+            self.output_projection = nn.Linear(
+                self.embed_tokens.weight.shape[1],
+                self.embed_tokens.weight.shape[0],
+                bias=False,
+            )
+            self.output_projection.weight = self.embed_tokens.weight
+        else:  # independent, randomly initialised projection
+            self.output_projection = nn.Linear(
+                self.output_embed_dim, len(dictionary), bias=False
+            )
+            nn.init.normal_(
+                self.output_projection.weight, mean=0, std=self.output_embed_dim**-0.5
+            )
+        num_base_layers = cfg.base_layers  # NOTE(review): BaseLayers are only inserted on this code path
+        for i in range(num_base_layers):
+            self.layers.insert(  # spread at evenly spaced positions among the decoder layers
+                ((i + 1) * cfg.decoder.layers) // (num_base_layers + 1),
+                BaseLayer(cfg),
+            )
+
+    def build_decoder_layer(self, cfg, no_encoder_attn=False):
+        """Build one decoder layer, optionally checkpoint-wrapped, then FSDP-wrapped."""
+        decoder_layer = transformer_layer.TransformerDecoderLayerBase(cfg, no_encoder_attn)
+        if not cfg.checkpoint_activations:
+            return fsdp_wrap(decoder_layer, min_num_params=cfg.min_params_to_wrap)
+        # When checkpointing, FSDP must always wrap the checkpointed layer
+        # regardless of its size, hence a threshold of 0 below.
+        decoder_layer = checkpoint_wrapper(
+            decoder_layer, offload_to_cpu=cfg.offload_activations
+        )
+        return fsdp_wrap(decoder_layer, min_num_params=0)
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+        src_lengths: Optional[Any] = None,
+        return_all_hiddens: bool = False,
+    ):
+        """
+        Extract decoder features and, unless ``features_only``, apply the output layer.
+
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention, should be of size T x B x C
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            src_lengths, return_all_hiddens: accepted but not used here.
+
+        Returns:
+            tuple: the output of shape `(batch, tgt_len, vocab)` and a
+            dictionary with any model-specific outputs
+        """
+        features, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+            full_context_alignment=full_context_alignment,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+        if features_only:
+            return features, extra
+        return self.output_layer(features), extra
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Delegate to ``extract_features_scriptable`` with the same arguments,
+        passed positionally and in the same order.
+        """
+        return self.extract_features_scriptable(
+            prev_output_tokens, encoder_out, incremental_state,
+            full_context_alignment, alignment_layer, alignment_heads,
+        )
+
+    """
+    A scriptable subclass of this class has an extract_features method and calls
+    super().extract_features, but super() is not supported in torchscript. A copy of
+    this function is made to be used in the subclass instead.
+    """
+
+    def extract_features_scriptable(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Includes several features from "Jointly Learning to Align and
+        Translate with Transformer Models" (Garg et al., EMNLP 2019).
+
+        Args:
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            alignment_layer (int, optional): return mean alignment over
+                heads at this layer (default: last layer).
+            alignment_heads (int, optional): only average alignment over
+                the first `alignment_heads` heads (default: all heads).
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dict with keys "attn" (one-element list) and "inner_states"
+        """
+        bs, slen = prev_output_tokens.size()  # unpacking enforces a 2-D input; bs/slen are unused below
+        if alignment_layer is None:
+            alignment_layer = self.num_layers - 1
+
+        enc: Optional[Tensor] = None
+        padding_mask: Optional[Tensor] = None
+        if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:  # an empty list leaves enc as None
+            enc = encoder_out["encoder_out"][0]
+        if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
+            padding_mask = encoder_out["encoder_padding_mask"][0]
+
+        # embed positions
+        positions = None
+        if self.embed_positions is not None:
+            positions = self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+
+        if incremental_state is not None:  # incremental decoding: keep only the last time step
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # Prevent torchscript exporting issue for dynamic quant embedding
+        prev_output_tokens = prev_output_tokens.contiguous()
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        self_attn_padding_mask: Optional[Tensor] = None
+        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():  # otherwise the mask stays None
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+
+        # decoder layers
+        attn: Optional[Tensor] = None
+        inner_states: List[Optional[Tensor]] = [x]  # embedding output first, then one entry per layer
+        for idx, layer in enumerate(self.layers):
+            if incremental_state is None and not full_context_alignment:
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+
+            x, layer_attn, _ = layer(
+                x,
+                enc,
+                padding_mask,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),  # attention is requested from the alignment layer only
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+            inner_states.append(x)
+            if layer_attn is not None and idx == alignment_layer:
+                attn = layer_attn.float().to(x)
+
+        if attn is not None:
+            if alignment_heads is not None:
+                attn = attn[:alignment_heads]  # keep the first alignment_heads entries of dim 0
+
+            # average probabilities over heads
+            attn = attn.mean(dim=0)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": [attn], "inner_states": inner_states}
+
+    def output_layer(self, features):
+        """Project features to the vocabulary size unless adaptive softmax is set."""
+        if self.adaptive_softmax is not None:
+            # with adaptive softmax configured the features are returned untouched
+            return features
+        # project back to size of vocabulary
+        return self.output_projection(features)
+
+    def max_positions(self):
+        """Return the longest output length this decoder supports."""
+        if self.embed_positions is not None:
+            return min(self.max_target_positions, self.embed_positions.max_positions)
+        return self.max_target_positions
+
+    def buffered_future_mask(self, tensor):  # returns a (dim, dim) slice of the cached mask, dim = tensor.size(0)
+        dim = tensor.size(0)
+        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
+        if (
+            self._future_mask.size(0) == 0  # nothing cached yet
+            or (not self._future_mask.device == tensor.device)
+            or self._future_mask.size(0) < dim  # cached mask is too small for this input
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1
+            )  # triu(..., 1) zeroes the diagonal and everything below it
+        self._future_mask = self._future_mask.to(tensor)  # match the input's dtype and device
+        return self._future_mask[:dim, :dim]
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rewrite legacy checkpoint keys under *name* in place; return state_dict."""
+        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
+            # discard any stored sinusoidal weights and (re)insert the
+            # `_float_tensor` placeholder entry
+            state_dict.pop("{}.embed_positions.weights".format(name), None)
+            placeholder_key = "{}.embed_positions._float_tensor".format(name)
+            state_dict[placeholder_key] = torch.FloatTensor(1)
+
+        projection_key = f"{name}.output_projection.weight"
+        if projection_key not in state_dict:
+            # fall back to the key older checkpoints used for the output weights
+            if self.share_input_output_embed:
+                legacy_key = f"{name}.embed_tokens.weight"
+            else:
+                legacy_key = f"{name}.embed_out"
+            if legacy_key in state_dict:
+                state_dict[projection_key] = state_dict[legacy_key]
+                # the embed_tokens entry is kept; only a separate embed_out is dropped
+                if not self.share_input_output_embed:
+                    del state_dict[legacy_key]
+
+        # update layer norms: `layer_norms.<n>` entries move to named attributes
+        layer_norm_names = {
+            "0": "self_attn_layer_norm",
+            "1": "encoder_attn_layer_norm",
+            "2": "final_layer_norm",
+        }
+        for layer_idx in range(self.num_layers):
+            prefix = "{}.layers.{}".format(name, layer_idx)
+            for old_index, new_name in layer_norm_names.items():
+                for param in ("weight", "bias"):
+                    old_key = "{}.layer_norms.{}.{}".format(prefix, old_index, param)
+                    if old_key in state_dict:
+                        new_key = "{}.{}.{}".format(prefix, new_name, param)
+                        state_dict[new_key] = state_dict.pop(old_key)
+
+        version_key = "{}.version".format(name)
+        stored_version = state_dict.get(version_key, torch.Tensor([1]))[0]
+        if utils.item(stored_version) <= 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+
+        return state_dict
+
+
+def Linear(in_features, out_features, bias=True):
+    """Return an nn.Linear with Xavier-uniform weights and, if enabled, a zero bias."""
+    projection = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(projection.weight)
+    if bias: nn.init.constant_(projection.bias, 0.0)  # bias exists only when requested
+    return projection
+
+
+class TransformerDecoder(TransformerDecoderBase):  # front end that takes `args` and converts it to a TransformerConfig
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+        output_projection=None,
+    ):
+        self.args = args  # assigned before the parent constructor runs
+        super().__init__(
+            TransformerConfig.from_namespace(args),  # the parent receives the converted config, not `args`
+            dictionary,
+            embed_tokens,
+            no_encoder_attn=no_encoder_attn,
+            output_projection=output_projection,
+        )
+
+    def build_output_projection(self, args, dictionary, embed_tokens):
+        super().build_output_projection(  # NOTE: the parent's result is not returned; this method returns None
+            TransformerConfig.from_namespace(args), dictionary, embed_tokens
+        )
+
+    def build_decoder_layer(self, args, no_encoder_attn=False):
+        return super().build_decoder_layer(  # same conversion, then the parent builds the layer
+            TransformerConfig.from_namespace(args), no_encoder_attn=no_encoder_attn
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
new file mode 100644
index 0000000000..c5e7101794
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
@@ -0,0 +1,392 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import utils
+from fairseq.distributed import fsdp_wrap
+from fairseq.models.transformer import TransformerConfig
+from fairseq.models.transformer.transformer_decoder import TransformerDecoderBase
+from fairseq.modules import (
+    LayerDropModuleList,
+    SinusoidalPositionalEmbedding,
+    transformer_layer_aug,
+)
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+
+
+class AugTransformerDecoderBase(TransformerDecoderBase):
+    """
+    Transformer decoder augmented with an additional cross-attention. Each layer
+    is a :class:`AugTransformerDecoderLayerBase`.
+
+    Args:
+        cfg (argparse.Namespace): parsed command-line arguments
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): output embedding
+        encoder_attn_merge_type (str, optional): the way to combine outputs from
+            two cross-attention modules. If "sequential" is set, two cross-attention
+            modules are stacked sequentially. If "parallel" is set, they are processed
+            in parallel and combined before feeding it to FFN (default: sequential).
+        dropnet_ratio (float, optional): a probability to drop each cross-attention
+            module during training (default: 0.0).
+    """
+
+    def __init__(
+        self,
+        cfg,
+        dictionary,
+        embed_tokens,
+        output_projection=None,
+        encoder_attn_merge_type="sequential",
+        dropnet_ratio=0.0,
+    ):
+        super().__init__(
+            cfg,
+            dictionary,
+            embed_tokens,
+            no_encoder_attn=False,  # hard-coded: callers cannot disable encoder attention here
+            output_projection=output_projection,
+        )
+        # NOTE(review): an assert on cfg.cross_self_attention is disabled here — confirm intent
+        self.cross_self_attention = cfg.cross_self_attention
+
+        if self.decoder_layerdrop > 0.0:  # self.layers is (re)assigned below; presumably replaces the parent's stack — TODO confirm
+            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_decoder_layer(cfg, encoder_attn_merge_type, dropnet_ratio)  # one augmented layer per cfg.decoder.layers
+                for _ in range(cfg.decoder.layers)
+            ]
+        )
+
+    def build_decoder_layer(
+        self,
+        cfg,
+        encoder_attn_merge_type="sequential",
+        dropnet_ratio=0,
+    ):
+        """Create one augmented decoder layer, then apply checkpoint and FSDP wrapping."""
+        decoder_layer = transformer_layer_aug.AugTransformerDecoderLayerBase(
+            cfg,
+            no_encoder_attn=False,
+            encoder_attn_merge_type=encoder_attn_merge_type,
+            dropnet_ratio=dropnet_ratio,
+        )
+        use_checkpointing = cfg.checkpoint_activations
+        if use_checkpointing:
+            decoder_layer = checkpoint_wrapper(
+                decoder_layer, offload_to_cpu=cfg.offload_activations
+            )
+        # a checkpointed layer must always be FSDP-wrapped, regardless of its size
+        wrap_threshold = 0 if use_checkpointing else cfg.min_params_to_wrap
+        return fsdp_wrap(decoder_layer, min_num_params=wrap_threshold)
+
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+        src_lengths: Optional[Any] = None,  # accepted but not used in this method
+        return_all_hiddens: bool = False,  # accepted but not used in this method
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention, should be of size T x B x C
+            encoder_out_aug (optional): second encoder output, same layout as *encoder_out*
+            incremental_state (dict): state kept across :ref:`Incremental decoding` steps
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            encoder_out_aug=encoder_out_aug,
+            incremental_state=incremental_state,
+            full_context_alignment=full_context_alignment,
+            alignment_layer=alignment_layer,
+            alignment_heads=alignment_heads,
+        )
+
+        if not features_only:  # features_only=True returns the features without the output layer
+            x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        encoder_out_aug: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):  # returns the result of extract_features_scriptable unchanged
+        return self.extract_features_scriptable(  # every argument is forwarded positionally
+            prev_output_tokens,
+            encoder_out,
+            encoder_out_aug,
+            incremental_state,
+            full_context_alignment,
+            alignment_layer,
+            alignment_heads,
+        )
+
+    """
+    TorchScript does not support super(), so a scriptable subclass that overrides
+    extract_features cannot call super().extract_features. The implementation is
+    therefore kept in extract_features_scriptable, which the subclass calls instead.
+    """
+
+    def extract_features_scriptable(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        encoder_out_aug: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Includes several features from "Jointly Learning to Align and
+        Translate with Transformer Models" (Garg et al., EMNLP 2019).
+
+        Args:
+            full_context_alignment (bool, optional): don't apply
+                auto-regressive mask to self-attention (default: False).
+            alignment_layer (int, optional): return mean alignment over
+                heads at this layer (default: last layer).
+            alignment_heads (int, optional): only average alignment over
+                the first `alignment_heads` heads (default: all heads).
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dict with keys "attn", "attn_aug" (one-element lists) and "inner_states"
+        """
+        bs, slen = prev_output_tokens.size()  # unpacking enforces a 2-D input; bs/slen are unused below
+        if alignment_layer is None:
+            alignment_layer = self.num_layers - 1
+
+        enc: Optional[Tensor] = None
+        padding_mask: Optional[Tensor] = None
+        if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:  # an empty list leaves enc as None
+            enc = encoder_out["encoder_out"][0]
+        if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
+            padding_mask = encoder_out["encoder_padding_mask"][0]
+
+        enc_aug: Optional[Tensor] = None  # same extraction, applied to the second encoder output
+        padding_mask_aug: Optional[Tensor] = None
+        if encoder_out_aug is not None and len(encoder_out_aug["encoder_out"]) > 0:
+            enc_aug = encoder_out_aug["encoder_out"][0]
+        if (
+            encoder_out_aug is not None
+            and len(encoder_out_aug["encoder_padding_mask"]) > 0
+        ):
+            padding_mask_aug = encoder_out_aug["encoder_padding_mask"][0]
+
+        # embed positions
+        positions = None
+        if self.embed_positions is not None:
+            positions = self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+
+        if incremental_state is not None:  # incremental decoding: keep only the last time step
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # Prevent torchscript exporting issue for dynamic quant embedding
+        prev_output_tokens = prev_output_tokens.contiguous()
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        self_attn_padding_mask: Optional[Tensor] = None
+        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():  # otherwise the mask stays None
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+
+        # decoder layers
+        attn: Optional[Tensor] = None
+        attn_aug: Optional[Tensor] = None
+        inner_states: List[Optional[Tensor]] = [x]  # embedding output first, then one entry per layer
+        for idx, layer in enumerate(self.layers):
+            if incremental_state is None and not full_context_alignment:
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+
+            x, layer_attn, layer_attn_aug, _ = layer(
+                x,
+                enc,
+                padding_mask,
+                enc_aug,
+                padding_mask_aug,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),  # attention is requested from the alignment layer only
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+            inner_states.append(x)
+            if layer_attn is not None and idx == alignment_layer:
+                attn = layer_attn.float().to(x)
+            if layer_attn_aug is not None and idx == alignment_layer:
+                attn_aug = layer_attn_aug.float().to(x)
+
+        if attn is not None:
+            if alignment_heads is not None:
+                attn = attn[:alignment_heads]  # keep the first alignment_heads entries of dim 0
+
+            # average probabilities over heads
+            attn = attn.mean(dim=0)
+
+        if attn_aug is not None:
+            if alignment_heads is not None:
+                attn_aug = attn_aug[:alignment_heads]  # same truncation for the second attention
+
+            # average probabilities over heads
+            attn_aug = attn_aug.mean(dim=0)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+
+        return x, {"attn": [attn], "attn_aug": [attn_aug], "inner_states": inner_states}
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Rewrite legacy checkpoint keys under *name* in place; return state_dict."""
+        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
+            # discard any stored sinusoidal weights and (re)insert the
+            # `_float_tensor` placeholder entry
+            state_dict.pop("{}.embed_positions.weights".format(name), None)
+            placeholder_key = "{}.embed_positions._float_tensor".format(name)
+            state_dict[placeholder_key] = torch.FloatTensor(1)
+
+        projection_key = f"{name}.output_projection.weight"
+        if projection_key not in state_dict:
+            # fall back to the key older checkpoints used for the output weights
+            if self.share_input_output_embed:
+                legacy_key = f"{name}.embed_tokens.weight"
+            else:
+                legacy_key = f"{name}.embed_out"
+            if legacy_key in state_dict:
+                state_dict[projection_key] = state_dict[legacy_key]
+                # the embed_tokens entry is kept; only a separate embed_out is dropped
+                if not self.share_input_output_embed:
+                    del state_dict[legacy_key]
+
+        # update layer norms: `layer_norms.<n>` entries move to named attributes
+        layer_norm_names = {
+            "0": "self_attn_layer_norm",
+            "1": "encoder_attn_layer_norm",
+            "2": "encoder_attn_layer_norm2",
+            "3": "final_layer_norm",
+        }
+        for layer_idx in range(self.num_layers):
+            prefix = "{}.layers.{}".format(name, layer_idx)
+            for old_index, new_name in layer_norm_names.items():
+                for param in ("weight", "bias"):
+                    old_key = "{}.layer_norms.{}.{}".format(prefix, old_index, param)
+                    if old_key in state_dict:
+                        new_key = "{}.{}.{}".format(prefix, new_name, param)
+                        state_dict[new_key] = state_dict.pop(old_key)
+
+        version_key = "{}.version".format(name)
+        stored_version = state_dict.get(version_key, torch.Tensor([1]))[0]
+        if utils.item(stored_version) <= 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+
+        return state_dict
+
+
+class AugTransformerDecoder(AugTransformerDecoderBase):
+    """Front end for AugTransformerDecoderBase that accepts an ``args`` namespace."""
+
+    def __init__(self, args, dictionary, embed_tokens, output_projection=None):
+        """Convert *args* to a TransformerConfig and build the augmented decoder.
+
+        Reads ``synthesizer_augmented_cross_attention_merge_type`` (default
+        "sequential") and ``dropnet_ratio`` (default 0) from *args* when present.
+        """
+        self.args = args
+        # NOTE: the parent constructor takes no `no_encoder_attn` argument, so it
+        # must not be forwarded (passing it raised TypeError).
+        super().__init__(
+            TransformerConfig.from_namespace(args),
+            dictionary,
+            embed_tokens,
+            output_projection=output_projection,
+            encoder_attn_merge_type=getattr(
+                args, "synthesizer_augmented_cross_attention_merge_type", "sequential"
+            ),
+            dropnet_ratio=getattr(args, "dropnet_ratio", 0),
+        )
+
+    def build_output_projection(self, args, dictionary, embed_tokens):
+        """Forward to the parent after converting *args* to a TransformerConfig."""
+        super().build_output_projection(
+            TransformerConfig.from_namespace(args), dictionary, embed_tokens
+        )
+
+    def build_decoder_layer(
+        self, args, encoder_attn_merge_type="sequential", dropnet_ratio=0
+    ):
+        """Build one layer from *args*; the parent takes no `no_encoder_attn`."""
+        return super().build_decoder_layer(
+            TransformerConfig.from_namespace(args),
+            encoder_attn_merge_type=encoder_attn_merge_type,
+            dropnet_ratio=dropnet_ratio,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
new file mode 100644
index 0000000000..17369ec8dc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
@@ -0,0 +1,370 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import utils
+from fairseq.distributed import fsdp_wrap
+from fairseq.models import FairseqEncoder
+from fairseq.models.transformer import TransformerConfig
+from fairseq.modules import (
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    PositionalEmbedding,
+    SinusoidalPositionalEmbedding,
+    transformer_layer,
+)
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+
+# rewrite name for backward compatibility in `make_generation_fast_`
+def module_name_fordropout(module_name: str) -> str:
+    """Map "TransformerEncoderBase" to "TransformerEncoder"; pass other names through."""
+    if module_name != "TransformerEncoderBase":
+        return module_name
+    return "TransformerEncoder"
+
+
+class TransformerEncoderBase(FairseqEncoder):
+    """
+    Transformer encoder consisting of *cfg.encoder.layers* layers. Each layer
+    is a :class:`TransformerEncoderLayer`.
+
+    Args:
+        cfg: model configuration, read by attribute (e.g. ``cfg.encoder.layers``)
+        dictionary (~fairseq.data.Dictionary): encoding dictionary
+        embed_tokens (torch.nn.Embedding): input embedding
+    """
+
+    def __init__(self, cfg, dictionary, embed_tokens, return_fc=False):
+        self.cfg = cfg  # assigned before the parent constructor runs
+        super().__init__(dictionary)
+        self.register_buffer("version", torch.Tensor([3]))  # registered as a buffer named "version"
+
+        self.dropout_module = FairseqDropout(
+            cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
+        )
+        self.encoder_layerdrop = cfg.encoder.layerdrop
+        self.return_fc = return_fc  # forwarded to each layer by build_encoder_layer
+
+        embed_dim = embed_tokens.embedding_dim
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_source_positions = cfg.max_source_positions
+
+        self.embed_tokens = embed_tokens
+
+        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                cfg.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=cfg.encoder.learned_pos,
+            )
+            if not cfg.no_token_positional_embeddings
+            else None
+        )
+        if cfg.layernorm_embedding:
+            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
+        else:
+            self.layernorm_embedding = None
+
+        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:  # optional module that forward_embedding applies last
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(embed_dim, embed_dim, bias=False),
+                cfg.quant_noise.pq,
+                cfg.quant_noise.pq_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        if self.encoder_layerdrop > 0.0:  # container type depends on whether layerdrop is enabled
+            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)]
+        )
+        self.num_layers = len(self.layers)
+
+        if cfg.encoder.normalize_before:  # a final LayerNorm exists only when normalize_before is set
+            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
+        else:
+            self.layer_norm = None
+
+    def build_encoder_layer(self, cfg):
+        """Create one encoder layer, then apply checkpoint and FSDP wrapping."""
+        encoder_layer = transformer_layer.TransformerEncoderLayerBase(
+            cfg, return_fc=self.return_fc
+        )
+        use_checkpointing = cfg.checkpoint_activations
+        if use_checkpointing:
+            encoder_layer = checkpoint_wrapper(
+                encoder_layer, offload_to_cpu=cfg.offload_activations
+            )
+        # a checkpointed layer must always be FSDP-wrapped, regardless of its size
+        wrap_threshold = 0 if use_checkpointing else cfg.min_params_to_wrap
+        return fsdp_wrap(encoder_layer, min_num_params=wrap_threshold)
+
+    def forward_embedding(self, src_tokens, token_embedding: Optional[torch.Tensor] = None):
+        """Embed *src_tokens*; return (processed input, scaled token embedding)."""
+        if token_embedding is None:
+            token_embedding = self.embed_tokens(src_tokens)
+        scaled = self.embed_scale * token_embedding
+        features = scaled
+        if self.embed_positions is not None:
+            features = scaled + self.embed_positions(src_tokens)
+        if self.layernorm_embedding is not None:
+            features = self.layernorm_embedding(features)
+        features = self.dropout_module(features)
+        # quant noise, when configured, is applied after dropout
+        if self.quant_noise is not None:
+            features = self.quant_noise(features)
+        return features, scaled
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths: Optional[torch.Tensor] = None,
+        return_all_hiddens: bool = False,
+        token_embeddings: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (torch.LongTensor): lengths of each source sentence of
+                shape `(batch)`
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            token_embeddings (torch.Tensor, optional): precomputed embeddings;
+                when `None` they are computed from *src_tokens*
+
+        Returns:
+            dict:
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch, src_len)`
+                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
+                  of shape `(batch, src_len, embed_dim)`
+                - **encoder_states** (List[Tensor]): all intermediate
+                  hidden states of shape `(src_len, batch, embed_dim)`.
+                  Only populated if *return_all_hiddens* is True.
+        """
+        return self.forward_scriptable(  # pure delegation; arguments are forwarded positionally
+            src_tokens, src_lengths, return_all_hiddens, token_embeddings
+        )
+
+    # TorchScript doesn't support super() method so that the scriptable Subclass
+    # can't access the base class model in Torchscript.
+    # Current workaround is to add a helper function with different name and
+    # call the helper function from scriptable Subclass.
+    def forward_scriptable(
+        self,
+        src_tokens,
+        src_lengths: Optional[torch.Tensor] = None,
+        return_all_hiddens: bool = False,
+        token_embeddings: Optional[torch.Tensor] = None,
+    ):
+        """
+        Encode *src_tokens*; this is the body that ``forward()`` delegates to.
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (torch.LongTensor, optional): not read by this method;
+                the returned lengths are recomputed from *src_tokens*
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            token_embeddings (torch.Tensor, optional): precomputed embeddings
+                default `None` will recompute embeddings
+
+        Returns:
+            dict of lists (an empty list is equivalent to None):
+                - **encoder_out**: last layer's output, `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask**: padding positions, `(batch, src_len)`
+                - **encoder_embedding**: (scaled) embedding lookup, `(batch, src_len, embed_dim)`
+                - **encoder_states**: hidden states `(src_len, batch, embed_dim)`,
+                  only populated if *return_all_hiddens* is True
+                - **fc_results**: per-layer second tuple element or None, same condition
+                - **src_tokens**: always empty; **src_lengths**: `(batch, 1)` int32 counts
+        """
+        # compute padding mask; on XLA devices padding is always assumed present
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+        has_pads = (
+            torch.tensor(src_tokens.device.type == "xla") or encoder_padding_mask.any()
+        )
+        # Torchscript doesn't handle bool Tensor correctly, so we need to work around.
+        if torch.jit.is_scripting():
+            has_pads = torch.tensor(1) if has_pads else torch.tensor(0)
+
+        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
+
+        # zero out the padded positions (a no-op when has_pads is false)
+        x = x * (
+            1 - encoder_padding_mask.unsqueeze(-1).type_as(x) * has_pads.type_as(x)
+        )
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        encoder_states = []
+        fc_results = []
+
+        if return_all_hiddens:
+            encoder_states.append(x)  # the input to the first layer
+
+        # encoder layers
+        for layer in self.layers:
+            lr = layer(
+                x, encoder_padding_mask=encoder_padding_mask if has_pads else None
+            )
+
+            if isinstance(lr, tuple) and len(lr) == 2:  # layer returned (x, fc_result)
+                x, fc_result = lr
+            else:
+                x = lr
+                fc_result = None
+
+            if return_all_hiddens and not torch.jit.is_scripting():
+                assert encoder_states is not None
+                encoder_states.append(x)
+                fc_results.append(fc_result)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # The Pytorch Mobile lite interpreter does not support returning NamedTuple in
+        # `forward` so we use a dictionary instead.
+        # TorchScript does not support mixed values so the values are all lists.
+        # The empty list is equivalent to None.
+        src_lengths = (  # recomputed here; the src_lengths argument is not used
+            src_tokens.ne(self.padding_idx)
+            .sum(dim=1, dtype=torch.int32)
+            .reshape(-1, 1)
+            .contiguous()
+        )
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask],  # B x T
+            "encoder_embedding": [encoder_embedding],  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "fc_results": fc_results,  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [src_lengths],
+        }
+
+    @torch.jit.export
+    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
+        """
+        Reorder encoder output along the batch dimension by *new_order*.
+
+        Args:
+            encoder_out: output from the ``forward()`` method
+            new_order (LongTensor): indices passed to ``index_select``
+
+        Returns:
+            new dict without ``fc_results``; ``encoder_states`` is updated in place
+        """
+        if len(encoder_out["encoder_out"]) == 0:
+            new_encoder_out = []
+        else:
+            new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)]
+        if len(encoder_out["encoder_padding_mask"]) == 0:
+            new_encoder_padding_mask = []
+        else:
+            new_encoder_padding_mask = [
+                encoder_out["encoder_padding_mask"][0].index_select(0, new_order)
+            ]
+        if len(encoder_out["encoder_embedding"]) == 0:
+            new_encoder_embedding = []
+        else:
+            new_encoder_embedding = [
+                encoder_out["encoder_embedding"][0].index_select(0, new_order)
+            ]
+
+        if len(encoder_out["src_tokens"]) == 0:
+            src_tokens = []
+        else:
+            src_tokens = [(encoder_out["src_tokens"][0]).index_select(0, new_order)]
+
+        if len(encoder_out["src_lengths"]) == 0:
+            src_lengths = []
+        else:
+            src_lengths = [(encoder_out["src_lengths"][0]).index_select(0, new_order)]
+
+        encoder_states = encoder_out["encoder_states"]  # same list object as the input's
+        if len(encoder_states) > 0:
+            for idx, state in enumerate(encoder_states):
+                encoder_states[idx] = state.index_select(1, new_order)  # mutates the caller's list
+
+        return {
+            "encoder_out": new_encoder_out,  # T x B x C
+            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
+            "encoder_embedding": new_encoder_embedding,  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": src_tokens,  # B x T
+            "src_lengths": src_lengths,  # B x 1
+        }
+
+    @torch.jit.export
+    def _reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
+        """No-op re-order for beamable enc-dec attention; returns *encoder_out* as is."""
+        return encoder_out
+
+    def max_positions(self):
+        """Return the longest source length this encoder supports."""
+        cap, positions = self.max_source_positions, self.embed_positions
+        # a positional embedding, when present, may lower the configured cap
+        return cap if positions is None else min(cap, positions.max_positions)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Bring an older checkpoint's *state_dict* (rooted at *name*) up to date."""
+        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
+            stale_key = "{}.embed_positions.weights".format(name)
+            if stale_key in state_dict:
+                print("deleting {0}".format(stale_key))
+                del state_dict[stale_key]
+            float_key = "{}.embed_positions._float_tensor".format(name)
+            state_dict[float_key] = torch.FloatTensor(1)
+
+        # let every layer upgrade its own entries (e.g. layer norms)
+        for idx in range(self.num_layers):
+            layer_prefix = "{}.layers.{}".format(name, idx)
+            self.layers[idx].upgrade_state_dict_named(state_dict, layer_prefix)
+
+        version_key = "{}.version".format(name)
+        stored_version = state_dict.get(version_key, torch.Tensor([1]))
+        if utils.item(stored_version[0]) < 2:
+            # earlier checkpoints did not normalize after the stack of layers
+            self.layer_norm = None
+            self.normalize = False
+            state_dict[version_key] = torch.Tensor([1])
+        return state_dict
+
+
+class TransformerEncoder(TransformerEncoderBase):
+    """
+    Legacy encoder: converts the namespace *args* to a TransformerConfig.
+    """
+
+    def __init__(self, args, dictionary, embed_tokens, return_fc=False):
+        self.args = args  # stored before the base initializer runs
+        cfg = TransformerConfig.from_namespace(args)
+        super().__init__(cfg, dictionary, embed_tokens, return_fc=return_fc)
+
+    def build_encoder_layer(self, args):
+        """Convert *args* to a TransformerConfig and defer to the base class."""
+        layer_cfg = TransformerConfig.from_namespace(args)
+        return super().build_encoder_layer(layer_cfg)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
new file mode 100644
index 0000000000..00d14a7dde
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
@@ -0,0 +1,277 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.models import (
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer.transformer_config import (
+    TransformerConfig,
+    DEFAULT_MAX_SOURCE_POSITIONS,
+    DEFAULT_MAX_TARGET_POSITIONS,
+    DEFAULT_MIN_PARAMS_TO_WRAP,
+)
+from fairseq.models.transformer.transformer_base import (
+    TransformerModelBase,
+)
+
+
+@register_model("transformer")
+class TransformerModel(TransformerModelBase):
+    """
+    Legacy transformer model configured through an argparse namespace; its
+    hooks convert *args* to a TransformerConfig and defer to the base class.
+    """
+
+    @classmethod
+    def hub_models(cls):
+        # fmt: off
+        # Each helper pairs an archive path with the tokenizer/BPE names to use.
+        def moses_subword(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'subword_nmt',
+            }
+
+        def moses_fastbpe(path):
+            return {
+                'path': path,
+                'tokenizer': 'moses',
+                'bpe': 'fastbpe',
+            }
+
+        def spm(path):
+            return {
+                'path': path,
+                'bpe': 'sentencepiece',
+                'tokenizer': 'space',
+            }
+        # hub model name -> archive URL, or a dict built by one of the helpers
+        return {
+            'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'),
+            'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2',
+            'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'),
+            'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'),
+            'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'),
+            'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'),
+            'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'),
+            'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'),
+            'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'),
+            'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'),
+            'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'),
+            'transformer.wmt20.en-ta': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz'),
+            'transformer.wmt20.en-iu.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz'),
+            'transformer.wmt20.en-iu.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz'),
+            'transformer.wmt20.ta-en': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz'),
+            'transformer.wmt20.iu-en.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz'),
+            'transformer.wmt20.iu-en.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz'),
+            'transformer.flores101.mm100.615M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz'),
+            'transformer.flores101.mm100.175M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz'),
+        }
+        # fmt: on
+
+    def __init__(self, args, encoder, decoder):
+        cfg = TransformerConfig.from_namespace(args)
+        super().__init__(cfg, encoder, decoder)
+        self.args = args  # keep the original namespace alongside the converted cfg
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add model-specific arguments to the parser."""
+        # we want to build the args recursively in this case.
+        # do not set defaults, so that the defaults set by the various architectures still work
+        gen_parser_from_dataclass(
+            parser, TransformerConfig(), delete_default=True, with_prefix=""
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        if args.encoder_layers_to_keep:  # comma-separated list; its length sets the layer count
+            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
+        if args.decoder_layers_to_keep:
+            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
+
+        if getattr(args, "max_source_positions", None) is None:
+            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
+
+        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
+
+        if args.share_all_embeddings:
+            if src_dict != tgt_dict:
+                raise ValueError("--share-all-embeddings requires a joined dictionary")
+            if args.encoder_embed_dim != args.decoder_embed_dim:
+                raise ValueError(
+                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
+                )
+            if args.decoder_embed_path and (
+                args.decoder_embed_path != args.encoder_embed_path
+            ):
+                raise ValueError(
+                    "--share-all-embeddings not compatible with --decoder-embed-path"
+                )
+            args.share_decoder_input_output_embed = True  # forced on when all embeddings are shared
+
+        if getattr(args, "offload_activations", False):
+            args.checkpoint_activations = True  # offloading implies checkpointing
+
+        if not args.share_all_embeddings:
+            args.min_params_to_wrap = getattr(
+                args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP
+            )
+        cfg = TransformerConfig.from_namespace(args)
+        return super().build_model(cfg, task)
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):
+        return super().build_embedding(
+            TransformerConfig.from_namespace(args), dictionary, embed_dim, path
+        )
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return super().build_encoder(
+            TransformerConfig.from_namespace(args), src_dict, embed_tokens
+        )
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return super().build_decoder(
+            TransformerConfig.from_namespace(args), tgt_dict, embed_tokens
+        )
+
+
+# architectures
+
+
+@register_model_architecture("transformer", "transformer_tiny")
+def tiny_architecture(args):
+    tiny_defaults = {  # used only for attributes missing from args
+        "encoder_embed_dim": 64, "encoder_ffn_embed_dim": 64, "encoder_layers": 2,
+        "encoder_attention_heads": 2, "decoder_layers": 2, "decoder_attention_heads": 2,
+    }
+    for attr, fallback in tiny_defaults.items():
+        setattr(args, attr, getattr(args, attr, fallback))
+    return base_architecture(args)
+
+
+@register_model_architecture("transformer", "transformer")
+def base_architecture(args):
+    """Fill every transformer option missing from *args* with its base default.
+
+    Existing values are kept; later defaults may read earlier ones, so order matters.
+    """
+
+    def fill(attr, fallback):
+        setattr(args, attr, getattr(args, attr, fallback))
+
+    fill("encoder_embed_path", None)
+    fill("encoder_embed_dim", 512)
+    fill("encoder_ffn_embed_dim", 2048)
+    fill("encoder_layers", 6)
+    fill("encoder_attention_heads", 8)
+    fill("encoder_normalize_before", False)
+    fill("encoder_learned_pos", False)
+
+    fill("decoder_embed_path", None)
+    fill("decoder_embed_dim", args.encoder_embed_dim)
+    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
+    fill("decoder_layers", 6)
+    fill("decoder_attention_heads", 8)
+    fill("decoder_normalize_before", False)
+    fill("decoder_learned_pos", False)
+    fill("attention_dropout", 0.0)
+    fill("activation_dropout", 0.0)
+    fill("activation_fn", "relu")
+    fill("dropout", 0.1)
+    fill("adaptive_softmax_cutoff", None)
+    fill("adaptive_softmax_dropout", 0)
+    fill("share_decoder_input_output_embed", False)
+    fill("share_all_embeddings", False)
+    fill("merge_src_tgt_embed", False)
+    fill("no_token_positional_embeddings", False)
+    fill("adaptive_input", False)
+    fill("no_cross_attention", False)
+    fill("cross_self_attention", False)
+
+    fill("decoder_output_dim", args.decoder_embed_dim)
+    fill("decoder_input_dim", args.decoder_embed_dim)
+
+    fill("no_scale_embedding", False)
+    fill("layernorm_embedding", False)
+    fill("tie_adaptive_weights", False)
+    fill("checkpoint_activations", False)
+    fill("offload_activations", False)
+    if args.offload_activations:
+        args.checkpoint_activations = True  # offloading implies checkpointing
+    fill("encoder_layers_to_keep", None)
+    fill("decoder_layers_to_keep", None)
+    fill("encoder_layerdrop", 0)
+    fill("decoder_layerdrop", 0)
+    fill("quant_noise_pq", 0)
+    fill("quant_noise_pq_block_size", 8)
+    fill("quant_noise_scalar", 0)
+
+
+@register_model_architecture("transformer", "transformer_iwslt_de_en")
+def transformer_iwslt_de_en(args):
+    iwslt_defaults = {  # used only for attributes missing from args
+        "encoder_embed_dim": 512, "encoder_ffn_embed_dim": 1024,
+        "encoder_attention_heads": 4, "encoder_layers": 6,
+        "decoder_embed_dim": 512, "decoder_ffn_embed_dim": 1024,
+        "decoder_attention_heads": 4, "decoder_layers": 6,
+    }
+    for attr, fallback in iwslt_defaults.items():
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture("transformer", "transformer_wmt_en_de")
+def transformer_wmt_en_de(args):
+    base_architecture(args)  # no overrides: only the base defaults are applied
+
+
+# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
+@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
+def transformer_vaswani_wmt_en_de_big(args):
+    big_defaults = {  # used only for attributes missing from args
+        "encoder_embed_dim": 1024, "encoder_ffn_embed_dim": 4096,
+        "encoder_attention_heads": 16, "encoder_normalize_before": False,
+        "decoder_embed_dim": 1024, "decoder_ffn_embed_dim": 4096,
+        "decoder_attention_heads": 16, "dropout": 0.3,
+    }
+    for attr, fallback in big_defaults.items():
+        setattr(args, attr, getattr(args, attr, fallback))
+    base_architecture(args)
+
+
+@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big")
+def transformer_vaswani_wmt_en_fr_big(args):
+    args.dropout = getattr(args, "dropout", 0.1)  # set first, so the en-de preset's 0.3 default is not used
+    transformer_vaswani_wmt_en_de_big(args)
+
+
+@register_model_architecture("transformer", "transformer_wmt_en_de_big")
+def transformer_wmt_en_de_big(args):
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)  # the only default added to the Vaswani big preset
+    transformer_vaswani_wmt_en_de_big(args)
+
+
+# defaults matching the tensor2tensor implementation (applied only where unset)
+@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t")
+def transformer_wmt_en_de_big_t2t(args):
+    for flag in ("encoder_normalize_before", "decoder_normalize_before"):
+        setattr(args, flag, getattr(args, flag, True))
+    for rate in ("attention_dropout", "activation_dropout"):
+        setattr(args, rate, getattr(args, rate, 0.1))
+    transformer_vaswani_wmt_en_de_big(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
new file mode 100644
index 0000000000..eaf585bd10
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import (
+    TransformerModel,
+    base_architecture,
+    transformer_wmt_en_de_big,
+)
+
+
+@register_model("transformer_align")
+class TransformerAlignModel(TransformerModel):
+    """
+    See "Jointly Learning to Align and Translate with Transformer
+    Models" (Garg et al., EMNLP 2019).
+    """
+
+    def __init__(self, encoder, decoder, args):
+        super().__init__(args, encoder, decoder)  # note: the parent takes args first
+        self.alignment_heads = args.alignment_heads
+        self.alignment_layer = args.alignment_layer
+        self.full_context_alignment = args.full_context_alignment
+
+    @staticmethod
+    def add_args(parser):
+        # fmt: off
+        super(TransformerAlignModel, TransformerAlignModel).add_args(parser)  # explicit form: no implicit class in a staticmethod
+        parser.add_argument('--alignment-heads', type=int, metavar='D',
+                            help='Number of cross attention heads per layer to supervised with alignments')
+        parser.add_argument('--alignment-layer', type=int, metavar='D',
+                            help='Layer number which has to be supervised. 0 corresponding to the bottommost layer.')
+        parser.add_argument('--full-context-alignment', action='store_true',
+                            help='Whether or not alignment is supervised conditioned on the full target context.')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        # set any default arguments
+        transformer_align(args)
+        # build a plain TransformerModel, then wrap its encoder and decoder
+        transformer_model = TransformerModel.build_model(args, task)
+        return TransformerAlignModel(
+            transformer_model.encoder, transformer_model.decoder, args
+        )
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens):
+        encoder_out = self.encoder(src_tokens, src_lengths)
+        return self.forward_decoder(prev_output_tokens, encoder_out)
+
+    def forward_decoder(
+        self,
+        prev_output_tokens,
+        encoder_out=None,
+        incremental_state=None,  # NOTE(review): accepted but not read in this method
+        features_only=False,  # NOTE(review): accepted but not read in this method
+        **extra_args,  # only forwarded to the second decoder call below
+    ):
+        attn_args = {
+            "alignment_layer": self.alignment_layer,
+            "alignment_heads": self.alignment_heads,
+        }
+        decoder_out = self.decoder(prev_output_tokens, encoder_out, **attn_args)
+        # optional second pass with full_context_alignment set; its attn replaces the first one
+        if self.full_context_alignment:
+            attn_args["full_context_alignment"] = self.full_context_alignment
+            _, alignment_out = self.decoder(
+                prev_output_tokens,
+                encoder_out,
+                features_only=True,
+                **attn_args,
+                **extra_args,
+            )
+            decoder_out[1]["attn"] = alignment_out["attn"]
+
+        return decoder_out
+
+
+@register_model_architecture("transformer_align", "transformer_align")
+def transformer_align(args):
+    args.alignment_heads = getattr(args, "alignment_heads", 1)
+    args.alignment_layer = getattr(args, "alignment_layer", 4)
+    args.full_context_alignment = getattr(args, "full_context_alignment", False)
+    base_architecture(args)  # then fill in the remaining transformer defaults
+
+
+@register_model_architecture("transformer_align", "transformer_wmt_en_de_big_align")
+def transformer_wmt_en_de_big_align(args):
+    args.alignment_heads = getattr(args, "alignment_heads", 1)
+    args.alignment_layer = getattr(args, "alignment_layer", 4)
+    transformer_wmt_en_de_big(args)  # note: full_context_alignment is not defaulted here
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
new file mode 100644
index 0000000000..236d9942e1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any, Dict
+
+from fairseq import checkpoint_utils
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.models import register_model, register_model_architecture
+from fairseq.models.transformer import (
+    TransformerDecoder,
+    TransformerEncoder,
+    TransformerModel,
+    base_architecture as transformer_base_architecture,
+)
+
+
+@register_model("transformer_from_pretrained_xlm")
+class TransformerFromPretrainedXLMModel(TransformerModel):
+    @staticmethod
+    def add_args(parser):
+        """Add the TransformerModel arguments plus the XLM initialization options."""
+        TransformerModel.add_args(parser)
+        parser.add_argument(
+            "--pretrained-xlm-checkpoint",
+            type=str,
+            metavar="STR",
+            help="XLM model to use for initializing transformer encoder and/or decoder",
+        )
+        parser.add_argument(
+            "--init-encoder-only",
+            action="store_true",
+            help="if set, don't load the XLM weights and embeddings into decoder",
+        )
+        parser.add_argument(
+            "--init-decoder-only",
+            action="store_true",
+            help="if set, don't load the XLM weights and embeddings into encoder",
+        )
+
+    @classmethod
+    def build_model(cls, args, task, cls_dictionary=MaskedLMDictionary):
+        """Validate the XLM-specific options, then build via TransformerModel."""
+        assert hasattr(args, "pretrained_xlm_checkpoint"), (
+            "You must specify a path for --pretrained-xlm-checkpoint to use "
+            "--arch transformer_from_pretrained_xlm"
+        )
+        dictionaries = (task.source_dictionary, task.target_dictionary)
+        assert all(isinstance(d, cls_dictionary) for d in dictionaries), (
+            "You should use a MaskedLMDictionary when using --arch "
+            "transformer_from_pretrained_xlm because the pretrained XLM model "
+            "was trained using data binarized with MaskedLMDictionary. "
+            "For translation, you may want to use --task "
+            "translation_from_pretrained_xlm"
+        )
+        assert not (
+            getattr(args, "init_encoder_only", False)
+            and getattr(args, "init_decoder_only", False)
+        ), "Only one of --init-encoder-only and --init-decoder-only can be set."
+        return super().build_model(args, task)
+
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        return TransformerEncoderFromPretrainedXLM(args, src_dict, embed_tokens)
+
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        return TransformerDecoderFromPretrainedXLM(args, tgt_dict, embed_tokens)
+
+
+def upgrade_state_dict_with_xlm_weights(
+    state_dict: Dict[str, Any], pretrained_xlm_checkpoint: str
+) -> Dict[str, Any]:
+    """
+    Copy matching XLM checkpoint weights into a Transformer state dict.
+
+    Args:
+        state_dict: state dict for either TransformerEncoder or
+            TransformerDecoder; it is updated in place and returned
+        pretrained_xlm_checkpoint: checkpoint to load XLM weights from
+
+    Raises:
+        IOError: If *pretrained_xlm_checkpoint* does not exist
+        AssertionError: If a matched XLM parameter has no counterpart in
+            *state_dict* (e.g. the architectures differ)
+    """
+    if not os.path.exists(pretrained_xlm_checkpoint):
+        raise IOError("Model file not found: {}".format(pretrained_xlm_checkpoint))
+
+    checkpoint = checkpoint_utils.load_checkpoint_to_cpu(pretrained_xlm_checkpoint)
+    markers = ("embed_tokens", "embed_positions", "layers")
+    for name, weight in checkpoint["model"].items():
+        for marker in markers:
+            start = name.find(marker)
+            if start < 0:
+                continue
+            dest = name[start:]
+            assert dest in state_dict, (
+                "{} Transformer encoder / decoder "
+                "state_dict does not contain {}. Cannot "
+                "load {} from pretrained XLM checkpoint "
+                "{} into Transformer.".format(
+                    str(state_dict.keys()), dest, name, pretrained_xlm_checkpoint
+                )
+            )
+            state_dict[dest] = weight
+    return state_dict
+
+
+class TransformerEncoderFromPretrainedXLM(TransformerEncoder):
+    """Transformer encoder whose weights are initialized from an XLM checkpoint."""
+
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__(args, dictionary, embed_tokens)
+        # --init-decoder-only: don't load XLM weights for the encoder
+        skip_xlm_init = getattr(args, "init_decoder_only", False)
+        if not skip_xlm_init:
+            assert hasattr(args, "pretrained_xlm_checkpoint"), (
+                "--pretrained-xlm-checkpoint must be specified to load Transformer "
+                "encoder from pretrained XLM"
+            )
+            merged_state = upgrade_state_dict_with_xlm_weights(
+                self.state_dict(), args.pretrained_xlm_checkpoint
+            )
+            self.load_state_dict(merged_state, strict=True)
+
+
+class TransformerDecoderFromPretrainedXLM(TransformerDecoder):
+    """Transformer decoder whose weights are initialized from an XLM checkpoint."""
+
+    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
+        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
+        # --init-encoder-only: don't load XLM weights for the decoder
+        skip_xlm_init = getattr(args, "init_encoder_only", False)
+        if not skip_xlm_init:
+            assert hasattr(args, "pretrained_xlm_checkpoint"), (
+                "--pretrained-xlm-checkpoint must be specified to load Transformer "
+                "decoder from pretrained XLM"
+            )
+            merged_state = upgrade_state_dict_with_xlm_weights(
+                self.state_dict(), args.pretrained_xlm_checkpoint
+            )
+            self.load_state_dict(merged_state, strict=True)
+
+
+@register_model_architecture(
+    "transformer_from_pretrained_xlm", "transformer_from_pretrained_xlm"
+)
+def base_architecture(args):
+    transformer_base_architecture(args)  # no XLM-specific defaults; reuses the transformer base ones
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
new file mode 100644
index 0000000000..1e3aa72d38
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
@@ -0,0 +1,607 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+from omegaconf import II
+
+from fairseq import options, utils
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import (
+    DEFAULT_MIN_PARAMS_TO_WRAP,
+    Embedding,
+    TransformerDecoder,
+)
+from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
+from fairseq.utils import safe_getattr, safe_hasattr
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@dataclass
+class TransformerLanguageModelConfig(FairseqDataclass):  # options for the "transformer_lm" model family
+    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
+        default="relu", metadata={"help": "activation function to use"}
+    )
+    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
+    attention_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability for attention weights"}
+    )
+    activation_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
+    )
+    relu_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
+    )
+    decoder_embed_dim: int = field(
+        default=512, metadata={"help": "decoder embedding dimension"}
+    )
+    decoder_output_dim: int = field(
+        default=512, metadata={"help": "decoder output dimension"}
+    )
+    decoder_input_dim: int = field(
+        default=512, metadata={"help": "decoder input dimension"}
+    )
+    decoder_ffn_embed_dim: int = field(
+        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
+    )
+    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
+    decoder_attention_heads: int = field(
+        default=8, metadata={"help": "num decoder attention heads"}
+    )
+    decoder_normalize_before: bool = field(
+        default=False, metadata={"help": "apply layernorm before each decoder block"}
+    )
+    no_decoder_final_norm: bool = field(
+        default=False,
+        metadata={"help": "don't add an extra layernorm after the last decoder block"},
+    )
+    adaptive_softmax_cutoff: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "comma separated list of adaptive softmax cutoff points. "
+            "Must be used with adaptive_loss criterion"
+        },
+    )
+    adaptive_softmax_dropout: float = field(
+        default=0,
+        metadata={"help": "sets adaptive softmax dropout for the tail projections"},
+    )
+    adaptive_softmax_factor: float = field(
+        default=4, metadata={"help": "adaptive softmax factor"}
+    )
+    no_token_positional_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, disables positional embeddings (outside self attention)"
+        },
+    )
+    share_decoder_input_output_embed: bool = field(
+        default=False, metadata={"help": "share decoder input and output embeddings"}
+    )
+    character_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, uses character embedding convolutions to produce token embeddings"
+        },
+    )
+    character_filters: str = field(
+        default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
+        metadata={"help": "character CNN filters as a list of (width, output channels) tuples"},
+    )
+    character_embedding_dim: int = field(
+        default=4, metadata={"help": "size of character embeddings"}
+    )
+    char_embedder_highway_layers: int = field(
+        default=2,
+        metadata={"help": "number of highway layers for character token embedder"},
+    )
+    adaptive_input: bool = field(
+        default=False, metadata={"help": "if set, uses adaptive input"}
+    )
+    adaptive_input_factor: float = field(
+        default=4, metadata={"help": "adaptive input factor"}
+    )
+    adaptive_input_cutoff: Optional[str] = field(
+        default=None,
+        metadata={"help": "comma separated list of adaptive input cutoff points."},
+    )
+    tie_adaptive_weights: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, ties the weights of adaptive softmax and adaptive input"
+        },
+    )
+    tie_adaptive_proj: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, ties the projection weights of adaptive softmax and adaptive input"
+        },
+    )
+    decoder_learned_pos: bool = field(
+        default=False,
+        metadata={"help": "use learned positional embeddings in the decoder"},
+    )
+    layernorm_embedding: bool = field(
+        default=False, metadata={"help": "add layernorm to embedding"}
+    )
+    no_scale_embedding: bool = field(
+        default=False, metadata={"help": "if True, don't scale embeddings"}
+    )
+    checkpoint_activations: bool = field(
+        default=False, metadata={"help": "checkpoint activations at each layer"}
+    )
+    offload_activations: bool = field(
+        default=False,
+        metadata={"help": "move checkpointed activations to CPU after they are used."},
+    )
+    # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
+    decoder_layerdrop: float = field(
+        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
+    )
+    decoder_layers_to_keep: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "which layers to *keep* when pruning as a comma-separated list"
+        },
+    )
+    # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
+    quant_noise_pq: float = field(
+        default=0.0,
+        metadata={"help": "iterative PQ quantization noise at training time"},
+    )
+    quant_noise_pq_block_size: int = field(
+        default=8,
+        metadata={"help": "block size of quantization noise at training time"},
+    )
+    quant_noise_scalar: float = field(
+        default=0.0,
+        metadata={
+            "help": "scalar quantization noise and scalar quantization at training time"
+        },
+    )
+    # config for Fully Sharded Data Parallel (FSDP) training
+    min_params_to_wrap: int = field(
+        default=DEFAULT_MIN_PARAMS_TO_WRAP,
+        metadata={
+            "help": (
+                "minimum number of params for a layer to be wrapped with FSDP() when "
+                "training with --ddp-backend=fully_sharded. Smaller values will "
+                "improve memory efficiency, but may make torch.distributed "
+                "communication less efficient due to smaller input sizes. This option "
+                "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
+                "--offload-activations are passed."
+            )
+        },
+    )
+    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
+    base_layers: Optional[int] = field(
+        default=0, metadata={"help": "number of BASE layers in total"}
+    )
+    base_sublayers: Optional[int] = field(
+        default=1, metadata={"help": "number of sublayers in each BASE layer"}
+    )
+    base_shuffle: Optional[int] = field(
+        default=1,
+        metadata={"help": "shuffle tokens between workers before computing assignment"},
+    )
+    # NormFormer
+    scale_fc: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Insert LayerNorm between fully connected layers"},
+    )
+    scale_attn: Optional[bool] = field(
+        default=False, metadata={"help": "Insert LayerNorm after attention"}
+    )
+    scale_heads: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Learn a scale coefficient for each attention head"},
+    )
+    scale_resids: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Learn a scale coefficient for each residual connection"},
+    )
+
+    # xFormers arguments
+    decoder_xformers_att_config: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "config for xFormers library attention, defined in xformers.components.attention.AttentionConfig",
+        },
+    )
+
+    # options from other parts of the config
+    add_bos_token: bool = II("task.add_bos_token")
+    tokens_per_sample: int = II("task.tokens_per_sample")
+    max_target_positions: Optional[int] = II("task.max_target_positions")
+    tpu: bool = II("common.tpu")
+
+
+@register_model("transformer_lm", dataclass=TransformerLanguageModelConfig)
+class TransformerLanguageModel(FairseqLanguageModel):  # language model built around a single TransformerDecoder
+    @classmethod
+    def hub_models(cls):
+        def moses_fastbpe(path):
+            return {"path": path, "tokenizer": "moses", "bpe": "fastbpe"}
+
+        def spm(path):
+            return {"path": path, "tokenizer": "space", "bpe": "sentencepiece"}
+        # Each model name maps to a download URL, or to a dict that also names the tokenizer and BPE scheme.
+        return {
+            "transformer_lm.gbw.adaptive_huge": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2",
+            "transformer_lm.wiki103.adaptive": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2",
+            "transformer_lm.wmt19.en": moses_fastbpe(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2"
+            ),
+            "transformer_lm.wmt19.de": moses_fastbpe(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2"
+            ),
+            "transformer_lm.wmt19.ru": moses_fastbpe(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2"
+            ),
+            "transformer_lm.wmt20.en": spm(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.en.tar.gz"
+            ),
+            "transformer_lm.wmt20.ta": spm(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.ta.tar.gz"
+            ),
+            "transformer_lm.wmt20.iu.news": spm(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.news.tar.gz"
+            ),
+            "transformer_lm.wmt20.iu.nh": spm(
+                "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.nh.tar.gz"
+            ),
+        }
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # When a keep-list is given, the layer count becomes the number of listed layers.
+        if args.decoder_layers_to_keep:
+            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
+        # No max_target_positions set: use tokens_per_sample, else DEFAULT_MAX_TARGET_POSITIONS.
+        if safe_getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = safe_getattr(
+                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+            )
+        # Pick the token embedder: character-based, adaptive input, or the plain build_embedding() table.
+        if args.character_embeddings:
+            embed_tokens = CharacterTokenEmbedder(
+                task.source_dictionary,
+                eval(args.character_filters),  # NOTE(review): eval() of a config string; ast.literal_eval would be safer
+                args.character_embedding_dim,
+                args.decoder_embed_dim,
+                args.char_embedder_highway_layers,
+            )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(
+                len(task.source_dictionary),
+                task.source_dictionary.pad(),
+                args.decoder_input_dim,
+                args.adaptive_input_factor,
+                args.decoder_embed_dim,
+                options.eval_str_list(args.adaptive_input_cutoff, type=int),
+                args.quant_noise_pq,
+                args.quant_noise_pq_block_size,
+            )
+        else:
+            embed_tokens = cls.build_embedding(
+                args, task.source_dictionary, args.decoder_input_dim
+            )
+        # Tying adaptive weights needs adaptive input plus matching factors, cutoffs and input/output dims.
+        if args.tie_adaptive_weights:
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert (
+                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
+            ), "{} != {}".format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
+            )
+            assert args.decoder_input_dim == args.decoder_output_dim
+        # The decoder reads embeddings over the source dictionary and is built with no_encoder_attn=True.
+        decoder = TransformerDecoder(
+            args, task.target_dictionary, embed_tokens, no_encoder_attn=True
+        )
+        return cls(decoder)
+
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim, path=None):  # args and path are accepted but not used here
+        embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad())
+        return embed_tokens
+
+
+def base_lm_architecture(args):  # base preset for transformer_lm; mutates args in place and returns nothing
+    # backward compatibility for older model checkpoints
+    if safe_hasattr(args, "no_tie_adaptive_proj"):
+        # previous models defined --no-tie-adaptive-proj, so use the existence of
+        # that option to determine if this is an "old" model checkpoint
+        args.no_decoder_final_norm = True  # old models always set this to True
+        if args.no_tie_adaptive_proj is False:
+            args.tie_adaptive_proj = True
+    if safe_hasattr(args, "decoder_final_norm"):
+        args.no_decoder_final_norm = not args.decoder_final_norm
+    # Most options below are read back via safe_getattr with a fallback (presumably kept when already set -- confirm).
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
+
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 2048)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
+    args.adaptive_softmax_cutoff = safe_getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0)
+    args.adaptive_softmax_factor = safe_getattr(args, "adaptive_softmax_factor", 4)
+    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", False)
+    args.activation_fn = safe_getattr(args, "activation_fn", "relu")
+
+    args.decoder_layerdrop = safe_getattr(args, "decoder_layerdrop", 0)
+    args.decoder_layers_to_keep = safe_getattr(args, "decoder_layers_to_keep", None)
+    args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0)
+    args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8)
+    args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0)
+
+    args.base_layers = safe_getattr(args, "base_layers", 0)
+    args.base_sublayers = safe_getattr(args, "base_sublayers", 1)
+    args.base_shuffle = safe_getattr(args, "base_shuffle", False)
+
+    args.add_bos_token = safe_getattr(args, "add_bos_token", False)
+    args.no_token_positional_embeddings = safe_getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.share_decoder_input_output_embed = safe_getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.character_embeddings = safe_getattr(args, "character_embeddings", False)
+    # Output and input dims fall back to decoder_embed_dim, which was resolved above.
+    args.decoder_output_dim = safe_getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = safe_getattr(
+        args, "decoder_input_dim", args.decoder_embed_dim
+    )
+
+    # Model training is not stable without this
+    args.decoder_normalize_before = True  # unconditional: overwrites any value already on args
+    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", False)
+
+    args.adaptive_input = safe_getattr(args, "adaptive_input", False)
+    args.adaptive_input_factor = safe_getattr(args, "adaptive_input_factor", 4)
+    args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", None)
+
+    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", False)
+    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", False)
+
+    args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", False)
+    args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False)
+    args.checkpoint_activations = safe_getattr(args, "checkpoint_activations", False)
+    args.offload_activations = safe_getattr(args, "offload_activations", False)
+    args.scale_fc = safe_getattr(args, "scale_fc", False)
+    args.scale_attn = safe_getattr(args, "scale_attn", False)
+    args.scale_heads = safe_getattr(args, "scale_heads", False)
+    args.scale_resids = safe_getattr(args, "scale_resids", False)
+    if args.offload_activations:  # requesting offload also switches activation checkpointing on
+        args.checkpoint_activations = True
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_big")
+def transformer_lm_big(args):  # "big" preset: 12 layers, 1024-dim embeddings, 4096-dim FFN, 16 heads; rest from base_lm_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_wiki103")
+@register_model_architecture("transformer_lm", "transformer_lm_baevski_wiki103")
+def transformer_lm_baevski_wiki103(args):  # registered under two names; adaptive-input preset layered on transformer_lm_big
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 16)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8)
+    args.dropout = safe_getattr(args, "dropout", 0.3)
+    args.adaptive_input = safe_getattr(args, "adaptive_input", True)
+    args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", True)
+    args.adaptive_input_cutoff = safe_getattr(  # same cutoff string as the softmax below
+        args, "adaptive_input_cutoff", "20000,60000"
+    )
+    args.adaptive_softmax_cutoff = safe_getattr(
+        args, "adaptive_softmax_cutoff", "20000,60000"
+    )
+    args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0.2)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_dropout = safe_getattr(args, "activation_dropout", 0.1)
+    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True)
+    args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", True)
+    transformer_lm_big(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gbw")
+@register_model_architecture("transformer_lm", "transformer_lm_baevski_gbw")
+def transformer_lm_baevski_gbw(args):  # registered under two names; 512-dim variant layered on transformer_lm_big
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True)
+    transformer_lm_big(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt")
+def transformer_lm_gpt(args):  # preset: 12 layers, 768-dim embeddings, 3072-dim FFN, 12 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 3072)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_small")
+def transformer_lm_gpt2_small(args):  # preset: 24 layers, 1024-dim embeddings, 4096-dim FFN, 16 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_tiny")
+def transformer_lm_gpt2_tiny(args):  # preset: 2 layers, 64-dim embeddings, 64-dim FFN, 1 head, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 64)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 64)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 2)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 1)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_medium")
+def transformer_lm_gpt2_medium(args):  # preset: 36 layers, 1280-dim embeddings, 5120-dim FFN, 20 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1280)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 5120)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 36)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 20)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big")
+def transformer_lm_gpt2_big(args):  # preset: 48 layers, 1600-dim embeddings, 6400-dim FFN, 25 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1600)
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 6400)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 48)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 25)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big_wide")
+def transformer_lm_gpt2_big_wide(args):  # preset: 24 layers, 2048-dim embeddings, 8192-dim FFN, 32 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)  # safe_getattr, as in the sibling presets
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 8192)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt2_bigger")
+def transformer_lm_gpt2_bigger(args):  # preset: 48 layers, 2048-dim embeddings, 8192-dim FFN, 32 heads, gelu
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)  # safe_getattr, as in the sibling presets
+    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 8192)
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 48)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
+    args.dropout = safe_getattr(args, "dropout", 0.1)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
+
+
+def base_gpt3_architecture(args):  # shared GPT-3 defaults; reads args.decoder_embed_dim directly, so callers must set it first
+    args.decoder_input_dim = args.decoder_embed_dim  # input/output dims are forced to the embedding dim
+    args.decoder_output_dim = args.decoder_embed_dim
+    args.decoder_ffn_embed_dim = safe_getattr(
+        args, "decoder_ffn_embed_dim", args.decoder_embed_dim * 4
+    )
+    # GPT-3 used learned positional embeddings, rather than sinusoidal
+    args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", True)
+    args.dropout = safe_getattr(args, "dropout", 0.0)
+    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0)
+    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
+    args.share_decoder_input_output_embed = True  # unconditional: overwrites any value already on args
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_small")
+def transformer_lm_gpt3_small(args):
+    # 125M params: 12 layers, 768-dim embeddings, 12 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 12)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_medium")
+def transformer_lm_gpt3_medium(args):
+    # 350M params: 24 layers, 1024-dim embeddings, 16 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_large")
+def transformer_lm_gpt3_large(args):
+    # 760M params: 24 layers, 1536-dim embeddings, 16 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1536)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_xl")
+def transformer_lm_gpt3_xl(args):
+    # 1.3B params: 24 layers, 2048-dim embeddings, 32 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_2_7")
+def transformer_lm_gpt3_2_7(args):
+    # 2.7B params: 32 layers, 2560-dim embeddings, 32 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 32)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2560)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_6_7")
+def transformer_lm_gpt3_6_7(args):
+    # 6.7B params: 32 layers, 4096-dim embeddings, 32 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 32)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 4096)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_13")
+def transformer_lm_gpt3_13(args):
+    # 13B params: 40 layers, 5120-dim embeddings, 40 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 40)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 5120)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 40)
+    base_gpt3_architecture(args)
+
+
+@register_model_architecture("transformer_lm", "transformer_lm_gpt3_175")
+def transformer_lm_gpt3_175(args):
+    # 175B params: 96 layers, 12288-dim embeddings, 96 heads; the rest comes from base_gpt3_architecture
+    args.decoder_layers = safe_getattr(args, "decoder_layers", 96)
+    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 12288)
+    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 96)
+    base_gpt3_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
new file mode 100644
index 0000000000..0fc9ae4348
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
@@ -0,0 +1,408 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass, field
+from fairseq.models.fairseq_decoder import FairseqDecoder
+import numpy as np
+from typing import Optional, Dict, Any, List
+import torch
+from torch import nn
+from fairseq.data.data_utils import compute_mask_indices
+from fairseq.dataclass import ChoiceEnum
+from fairseq.models import (
+    FairseqLanguageModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.tasks.speech_ulm_task import SpeechUnitLanguageModelingTask
+from fairseq.models.transformer import Embedding, TransformerDecoder, Linear
+from fairseq.models.transformer_lm import TransformerLanguageModelConfig
+from torch import Tensor
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"])
+
+
+@dataclass
+class SpeechUnitLanguageModelConfig(TransformerLanguageModelConfig):  # adds unit / duration / f0 masking options
+    mask_unit_seg_prob: float = field(
+        default=0.0, metadata={"help": "probability to mask a segment of unit sequence"}
+    )
+    mask_unit_seg_leng: int = field(
+        default=5, metadata={"help": "length of unit segment mask"}
+    )
+    mask_unit_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose unit mask length"}
+    )
+
+    mask_dur_prob: float = field(
+        default=0.0, metadata={"help": "probability to mask entire duration sequence"}
+    )
+    mask_dur_seg_prob: float = field(
+        default=0.0,
+        metadata={"help": "probability to mask a segment of duration sequence"},
+    )
+    mask_dur_seg_leng: int = field(
+        default=5, metadata={"help": "length of duration segment mask"}
+    )
+    mask_dur_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose duration mask length"}
+    )
+
+    mask_f0_prob: float = field(
+        default=0.0, metadata={"help": "probability to mask entire f0 sequence"}
+    )
+    mask_f0_seg_prob: float = field(
+        default=0.0, metadata={"help": "probability to mask a segment of f0 sequence"}
+    )
+    mask_f0_seg_leng: int = field(
+        default=5, metadata={"help": "length of f0 segment mask"}
+    )
+    mask_f0_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose f0 mask length"}
+    )
+
+
+@register_model("transformer_ulm", dataclass=SpeechUnitLanguageModelConfig)
+class TransformerUnitLanguageModel(FairseqLanguageModel):
+    def __init__(
+        self,
+        cfg: SpeechUnitLanguageModelConfig,
+        task: SpeechUnitLanguageModelingTask,
+        decoder: FairseqDecoder,
+    ):
+        super().__init__(decoder)
+        self.cfg = cfg
+
+        self.channel_names = task.channel_names
+        self.channel_sizes = task.channel_sizes
+
+        self.unit_mask_val = task.source_dictionary.unk()
+        self.dur_mask_val = (
+            task.source_duration_dictionary.unk() if task.cfg.discrete_duration else 0
+        )
+        self.f0_mask_val = (
+            task.source_f0_dictionary.unk() if task.cfg.discrete_f0 else 0
+        )
+
+        self.ignore_duration_input = task.cfg.ignore_duration_input
+        self.ignore_f0_input = task.cfg.ignore_f0_input
+
+    @classmethod
+    def build_model(cls, args, task):
+        base_ulm_architecture(args)
+
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = getattr(
+                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+            )
+
+        embed_tokens = Embedding(
+            len(task.source_dictionary),
+            args.decoder_input_dim,
+            padding_idx=task.source_dictionary.pad(),
+        )
+        embed_duration = None
+        if task.cfg.discrete_duration:
+            embed_duration = Embedding(
+                len(task.source_duration_dictionary),
+                args.decoder_input_dim,
+                padding_idx=0,  # duration uses 0 for padding
+            )
+        embed_f0 = None
+        if task.cfg.discrete_f0:
+            embed_f0 = Embedding(
+                len(task.source_f0_dictionary),
+                args.decoder_input_dim,
+                padding_idx=task.source_f0_dictionary.pad(),
+            )
+
+        decoder = MultiStreamTransformerDecoder(
+            args,
+            task.target_dictionary,
+            embed_tokens,
+            [embed_duration, embed_f0],
+            no_encoder_attn=True,
+            channel_sizes=task.channel_sizes,
+        )
+
+        return cls(args, task, decoder)
+
+    def apply_seg_dropout(self, inp, mask_prob, mask_leng, mask_type, mask_val):
+        B, T = inp.size()  # inp must be 2-D for this unpacking to succeed
+        if mask_prob > 0:
+            mask_indices = compute_mask_indices(
+                (B, T), None, mask_prob, mask_leng, mask_type  # None is passed in place of a mask, so this may mask padding
+            )
+            mask_indices = torch.from_numpy(mask_indices).to(inp.device)
+            inp[mask_indices] = mask_val  # writes into the caller's tensor in place
+        else:
+            mask_indices = torch.zeros_like(inp).bool()  # nothing masked: all-False mask shaped like inp
+        return inp, mask_indices
+
+    def apply_seq_dropout(self, inp, mask_prob, mask_val):
+        B, T = inp.size()  # inp must be 2-D for this unpacking to succeed
+        if mask_prob > 0:
+            mask_indices = np.random.uniform(0, 1, (B,)) < mask_prob  # one draw per row of inp
+            mask_indices = (
+                torch.from_numpy(mask_indices).to(inp.device).unsqueeze(1).expand(-1, T)  # a selected row is masked at all T positions
+            )
+            inp[mask_indices] = mask_val  # writes into the caller's tensor in place
+        else:
+            mask_indices = torch.zeros_like(inp).bool()  # nothing masked: all-False mask shaped like inp
+        return inp, mask_indices
+
+    def apply_dropout(self, src_tokens, dur_src, f0_src):
+        src_tokens, unit_mask = self.apply_seg_dropout(
+            src_tokens,
+            self.cfg.mask_unit_seg_prob,
+            self.cfg.mask_unit_seg_leng,
+            self.cfg.mask_unit_seg_type,
+            self.unit_mask_val,
+        )
+
+        dur_src, dur_mask = self.apply_seq_dropout(
+            dur_src, self.cfg.mask_dur_prob, self.dur_mask_val
+        )
+        dur_src, _dur_mask = self.apply_seg_dropout(
+            dur_src,
+            self.cfg.mask_dur_seg_prob,
+            self.cfg.mask_dur_seg_leng,
+            self.cfg.mask_dur_seg_type,
+            self.dur_mask_val,
+        )
+        dur_mask = dur_mask.logical_or(_dur_mask)
+
+        f0_src, f0_mask = self.apply_seq_dropout(
+            f0_src, self.cfg.mask_f0_prob, self.f0_mask_val
+        )
+        f0_src, _f0_mask = self.apply_seg_dropout(
+            f0_src,
+            self.cfg.mask_f0_seg_prob,
+            self.cfg.mask_f0_seg_leng,
+            self.cfg.mask_f0_seg_type,
+            self.f0_mask_val,
+        )
+        f0_mask = f0_mask.logical_or(_f0_mask)
+
+        return src_tokens, unit_mask, dur_src, dur_mask, f0_src, f0_mask
+
+    def forward(
+        self,
+        src_tokens: torch.Tensor,
+        dur_src: torch.Tensor,
+        f0_src: torch.Tensor,
+        src_lengths: Optional[Any] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """Run the decoder on the unit, duration and f0 input streams.
+
+        Returns a dict pairing each name in ``self.channel_names`` with the
+        corresponding element of the decoder's prediction list.
+        """
+        if self.ignore_duration_input:
+            dur_src = torch.zeros_like(dur_src)
+
+        if self.ignore_f0_input:
+            f0_src = torch.zeros_like(f0_src)
+
+        if self.training:
+            # masking is applied in training mode only; the returned masks are not used below
+            src_tokens, unit_mask, dur_src, dur_mask, f0_src, f0_mask = (
+                self.apply_dropout(src_tokens, dur_src, f0_src)
+            )
+        else:
+            unit_mask = dur_mask = f0_mask = None
+
+        prediction, _ = self.decoder(
+            prev_output_tokens=(src_tokens, dur_src, f0_src),
+            incremental_state=incremental_state,
+            src_lengths=src_lengths,
+            features_only=True,
+        )
+
+        result = dict(zip(self.channel_names, prediction))
+        return result
+
+
+def base_ulm_architecture(args):
+    from .transformer_lm import base_lm_architecture
+
+    base_lm_architecture(args)
+
+
+@register_model_architecture("transformer_ulm", "transformer_ulm_big")
+def transformer_ulm_big(args):
+    from .transformer_lm import transformer_lm_big
+
+    transformer_lm_big(args)
+    base_ulm_architecture(args)
+
+
+@register_model_architecture("transformer_ulm", "transformer_ulm_tiny")
+def transformer_ulm_tiny(args):
+    from .transformer_lm import transformer_lm_gpt2_tiny
+
+    transformer_lm_gpt2_tiny(args)
+    base_ulm_architecture(args)
+
+
+class MultiStreamTransformerDecoder(TransformerDecoder):
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+        embed_other_list,
+        no_encoder_attn,
+        channel_sizes,
+    ):
+        super().__init__(
+            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
+        )
+
+        # embed each extra channel and project it when its width differs from the token embedding width
+        self.embed_other_list = torch.nn.ModuleList(embed_other_list)
+        self.proj_other_list = torch.nn.ModuleList()
+        dim = embed_tokens.embedding_dim
+        for embed_other in embed_other_list:
+            other_dim = 1 if embed_other is None else embed_other.embedding_dim  # no embedding (None): channel counts as width 1
+            self.proj_other_list.append(
+                nn.Linear(other_dim, dim) if other_dim != dim else None  # None means no projection is needed
+            )
+
+        # transformer output to prediction: one projection whose output width is the sum of all channel sizes
+        self.channel_sizes = channel_sizes
+        self.project_out_dim = Linear(
+            embed_tokens.embedding_dim, sum(channel_sizes), bias=False
+        )
+
+    def extract_features_scriptable(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        full_context_alignment: bool = False,
+        alignment_layer: Optional[int] = None,
+        alignment_heads: Optional[int] = None,
+    ):
+        if alignment_layer is None:
+            alignment_layer = self.num_layers - 1
+
+        # XXX: first multi-channel change start
+        prev_output_tokens, *other_channels = prev_output_tokens
+        # XXX: first multi-channel change end
+
+        # embed positions
+        positions = None
+        if self.embed_positions is not None:
+            positions = self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            other_channels = [o[:, -1:] for o in other_channels]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        # XXX: second multi-channel change start
+        other_channels = [
+            o.unsqueeze(-1).to(dtype=x.dtype) if emb is None else emb(o)
+            for o, emb in zip(other_channels, self.embed_other_list)
+        ]
+        other_channels = [
+            o if proj_other is None else proj_other(o)
+            for o, proj_other in zip(other_channels, self.proj_other_list)
+        ]
+        for o in other_channels:
+            x = x + o
+        # XXX: second multi-channel change end
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+
+        if self.layernorm_embedding is not None:
+            x = self.layernorm_embedding(x)
+
+        x = self.dropout_module(x)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        self_attn_padding_mask: Optional[Tensor] = None
+        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+
+        # decoder layers
+        attn: Optional[Tensor] = None
+        inner_states: List[Optional[Tensor]] = [x]
+        for idx, layer in enumerate(self.layers):
+            if incremental_state is None and not full_context_alignment:
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+
+            x, layer_attn, _ = layer(
+                x,
+                encoder_out["encoder_out"][0]
+                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
+                else None,
+                encoder_out["encoder_padding_mask"][0]
+                if (
+                    encoder_out is not None
+                    and len(encoder_out["encoder_padding_mask"]) > 0
+                )
+                else None,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=bool((idx == alignment_layer)),
+                need_head_weights=bool((idx == alignment_layer)),
+            )
+            inner_states.append(x)
+            if layer_attn is not None and idx == alignment_layer:
+                attn = layer_attn.float().to(x)
+
+        if attn is not None:
+            if alignment_heads is not None:
+                attn = attn[:alignment_heads]
+
+            # average probabilities over heads
+            attn = attn.mean(dim=0)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        if self.project_out_dim is not None:
+            x = self.project_out_dim(x)
+        else:
+            assert False
+
+        # XXX: the last change start
+        result = []
+        start = 0
+        for channel_size in self.channel_sizes:
+            end = start + channel_size
+            result.append(x[:, :, start:end])
+            start = end
+        assert end == x.size(-1)
+        # XXX: the last change end
+
+        return result, {"attn": [attn], "inner_states": inner_states}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
new file mode 100644
index 0000000000..bacbb7a95d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .wav2vec import *  # noqa
+from .wav2vec2 import *  # noqa
+from .wav2vec2_asr import *  # noqa
+from .wav2vec2_laser import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
new file mode 100644
index 0000000000..dd52d86242
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch.nn.functional as F
+
+
+def pad_to_multiple(x, multiple, dim=-1, value=0):
+    # Right-pad `dim` of x with `value` up to the next multiple of `multiple`; returns (tensor, pad_length).
+    # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
+    if x is None:
+        return None, 0
+    tsz = x.size(dim)
+    remainder = -tsz % multiple  # integer arithmetic; 0 when tsz is already a multiple
+    if remainder == 0:
+        return x, 0
+    # F.pad lists pads starting from the last dim, so count the dims that trail `dim` (works for dim >= 0 too)
+    trailing = (x.dim() - 1 - dim) if dim >= 0 else (-1 - dim)
+    return F.pad(x, (0, 0) * trailing + (0, remainder), value=value), remainder
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
new file mode 100644
index 0000000000..af6604da10
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
@@ -0,0 +1,630 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+import logging
+import math
+from typing import Optional, Tuple
+from omegaconf import II
+import sys
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.models import BaseFairseqModel, register_model
+from fairseq.modules import (
+    Fp32GroupNorm,
+    Fp32LayerNorm,
+    GumbelVectorQuantizer,
+    KmeansVectorQuantizer,
+    TransposeLast,
+)
+from fairseq.tasks import FairseqTask
+from fairseq.utils import buffered_arange
+
+
+logger = logging.getLogger(__name__)
+
+
+AGGREGATOR_CHOICES = ChoiceEnum(["cnn", "gru"])
+PROJECT_FEATURES_CHOICES = ChoiceEnum(["none", "same", "new"])
+ACTIVATION_CHOICES = ChoiceEnum(["relu", "gelu"])
+VQ_TYPE_CHOICES = ChoiceEnum(["none", "gumbel", "kmeans"])
+
+
+@dataclass
+class Wav2VecConfig(FairseqDataclass):  # options read by Wav2VecModel.__init__
+    prediction_steps: int = field(
+        default=12, metadata={"help": "number of steps ahead to predict"}
+    )
+    sample_distance: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "sample distance from target. does not work properly with cross-sampling"
+        },
+    )
+    cross_sample_negatives: int = field(
+        default=0, metadata={"help": "num of cross sampled negatives"}
+    )
+    num_negatives: int = field(
+        default=10, metadata={"help": "num of sampled negatives"}
+    )
+    conv_feature_layers: str = field(
+        default="[(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)]",
+        metadata={
+            "help": "convolutional feature extraction layers [(dim, kernel_size, stride), ...]"
+        },
+    )
+    conv_aggregator_layers: str = field(
+        default="[(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]",
+        metadata={
+            "help": "convolutional aggregator layers [(dim, kernel_size, stride), ...]"
+        },
+    )
+    dropout: float = field(
+        default=0.0, metadata={"help": "dropout to apply within the model"}
+    )
+    dropout_features: float = field(
+        default=0.0, metadata={"help": "dropout to apply to the features"}
+    )
+    dropout_agg: float = field(
+        default=0.0, metadata={"help": "dropout to apply after aggregation step"}
+    )
+    aggregator: AGGREGATOR_CHOICES = field(
+        default="cnn", metadata={"help": "type of aggregator to use"}
+    )
+    gru_dim: int = field(default=512, metadata={"help": "GRU dimensionality"})
+    no_conv_bias: bool = field(
+        default=False, metadata={"help": "if set, does not learn bias for conv layers"}
+    )
+    agg_zero_pad: bool = field(
+        default=False,
+        metadata={"help": "if set, zero pads in aggregator instead of repl pad"},
+    )
+    skip_connections_feat: bool = field(
+        default=False,
+        metadata={"help": "if set, adds skip connections to the feature extractor"},
+    )
+    skip_connections_agg: bool = field(
+        default=True,
+        metadata={"help": "if set, adds skip connections to the aggregator"},
+    )
+    residual_scale: float = field(
+        default=0.5, metadata={"help": "scales residual by sqrt(value)"}
+    )
+    log_compression: bool = field(
+        default=True,
+        metadata={"help": "if set, adds a log compression to feature extractor"},
+    )
+    balanced_classes: bool = field(
+        default=False,
+        metadata={"help": "if set, loss is scaled to balance for number of negatives"},
+    )
+    project_features: PROJECT_FEATURES_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, features are projected using the (same or new) aggregator"
+        },
+    )
+    non_affine_group_norm: bool = field(
+        default=False, metadata={"help": "if set, group norm is not affine"}
+    )
+    offset: str = field(
+        default="auto",
+        metadata={
+            "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value"
+        },
+    )
+    activation: ACTIVATION_CHOICES = field(
+        default="relu",
+        metadata={
+            "help": "activation function to use: 'relu' or 'gelu'"
+        },
+    )
+    vq_type: VQ_TYPE_CHOICES = field(
+        default="none", metadata={"help": "which type of quantizer to use"}
+    )
+    vq_vars: int = field(
+        default=320,
+        metadata={"help": "project to this many vector quantized variables per group"},
+    )
+    vq_groups: int = field(
+        default=2, metadata={"help": "number of groups of latent variables"}
+    )
+    vq_dim: int = field(
+        default=0,
+        metadata={
+            "help": "uses this dimensionality for quantized vectors. 0 to use model dim // groups"
+        },
+    )
+    vq_depth: int = field(
+        default=1, metadata={"help": "number of layers for vq weight projection"}
+    )
+    combine_groups: bool = field(
+        default=False, metadata={"help": "if set, variables are shared among groups"}
+    )
+    vq_temp: Tuple[float, float, float] = field(
+        default=(2.0, 0.5, 0.999995),
+        metadata={
+            "help": "temperature for latent variable sampling with gumbel softmax. should be a tuple of 3 values (start, end, decay)"
+        },
+    )
+    vq_gamma: float = field(
+        default=0.25,
+        metadata={"help": "gamma parameter for kmeans style vector quantization"},
+    )
+    infonce: bool = II("criterion.infonce")  # omegaconf interpolation of the criterion's infonce setting
+
+
+@register_model("wav2vec", dataclass=Wav2VecConfig)
+class Wav2VecModel(BaseFairseqModel):
+    @classmethod
+    def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask):
+        """Build a new model instance."""
+
+        model = Wav2VecModel(cfg)
+        logger.info(model)
+        return model
+
+    def __init__(self, cfg: Wav2VecConfig):
+        super().__init__()
+
+        self.prediction_steps = cfg.prediction_steps
+        offset = cfg.offset
+
+        if cfg.activation == "relu":
+            activation = nn.ReLU()
+        elif cfg.activation == "gelu":
+            activation = nn.GELU()
+        else:
+            raise Exception("unknown activation " + cfg.activation)
+
+        feature_enc_layers = eval(cfg.conv_feature_layers)
+        self.feature_extractor = ConvFeatureExtractionModel(
+            conv_layers=feature_enc_layers,
+            dropout=0.0,
+            log_compression=cfg.log_compression,
+            skip_connections=cfg.skip_connections_feat,
+            residual_scale=cfg.residual_scale,
+            non_affine_group_norm=cfg.non_affine_group_norm,
+            activation=activation,
+        )
+        embed = feature_enc_layers[-1][0]
+
+        self.vector_quantizer = None
+        if cfg.vq_type == "gumbel":
+            self.vector_quantizer = GumbelVectorQuantizer(
+                dim=embed,
+                num_vars=cfg.vq_vars,
+                temp=cfg.vq_temp,
+                groups=cfg.vq_groups,
+                combine_groups=cfg.combine_groups,
+                vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed,
+                time_first=False,
+                activation=activation,
+                weight_proj_depth=cfg.vq_depth,
+                weight_proj_factor=2,
+            )
+        elif cfg.vq_type == "kmeans":
+            self.vector_quantizer = KmeansVectorQuantizer(
+                dim=embed,
+                num_vars=cfg.vq_vars,
+                groups=cfg.vq_groups,
+                combine_groups=cfg.combine_groups,
+                vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed,
+                time_first=False,
+                gamma=cfg.vq_gamma,
+            )
+        else:
+            assert (
+                cfg.vq_type == "none" or cfg.vq_type is None
+            ), "Unknown quantizer type"
+
+        if cfg.offset == "auto":
+            jin = 0
+            rin = 0
+            for _, k, stride in feature_enc_layers:
+                if rin == 0:
+                    rin = k
+                rin = rin + (k - 1) * jin
+                if jin == 0:
+                    jin = stride
+                else:
+                    jin *= stride
+            offset = math.ceil(rin / jin)
+
+        offset = int(offset)
+
+        def make_aggregator():
+            if cfg.aggregator == "cnn":
+                agg_layers = eval(cfg.conv_aggregator_layers)
+                agg_dim = agg_layers[-1][0]
+                feature_aggregator = ConvAggegator(
+                    conv_layers=agg_layers,
+                    embed=embed,
+                    dropout=cfg.dropout,
+                    skip_connections=cfg.skip_connections_agg,
+                    residual_scale=cfg.residual_scale,
+                    non_affine_group_norm=cfg.non_affine_group_norm,
+                    conv_bias=not cfg.no_conv_bias,
+                    zero_pad=cfg.agg_zero_pad,
+                    activation=activation,
+                )
+            elif cfg.aggregator == "gru":
+                agg_dim = cfg.gru_dim
+                feature_aggregator = nn.Sequential(
+                    TransposeLast(),
+                    nn.GRU(
+                        input_size=embed,
+                        hidden_size=agg_dim,
+                        num_layers=1,
+                        dropout=cfg.dropout,
+                    ),
+                    TransposeLast(deconstruct_idx=0),
+                )
+            else:
+                raise Exception("unknown aggregator type " + cfg.aggregator)
+
+            return feature_aggregator, agg_dim
+
+        self.feature_aggregator, agg_dim = make_aggregator()
+
+        self.wav2vec_predictions = Wav2VecPredictionsModel(
+            in_dim=agg_dim,
+            out_dim=embed,
+            prediction_steps=cfg.prediction_steps,
+            n_negatives=cfg.num_negatives,
+            cross_sample_negatives=cfg.cross_sample_negatives,
+            sample_distance=cfg.sample_distance,
+            dropout=cfg.dropout,
+            offset=offset,
+            balanced_classes=cfg.balanced_classes,
+            infonce=cfg.infonce,
+        )
+
+        self.dropout_feats = nn.Dropout(p=cfg.dropout_features)
+        self.dropout_agg = nn.Dropout(p=cfg.dropout_agg)
+
+        if cfg.project_features == "none":
+            self.project_features = None
+        elif cfg.project_features == "same":
+            self.project_features = self.feature_aggregator
+        elif cfg.project_features == "new":
+            self.project_features, _ = make_aggregator()
+
+    def forward(self, source):
+        result = {}
+
+        features = self.feature_extractor(source)
+        if self.vector_quantizer:
+            q_res = self.vector_quantizer(features)
+            features = q_res["x"]
+            for k in q_res.keys():
+                if k != "x":
+                    result[k] = q_res[k]
+
+        x = self.dropout_feats(features)
+        x = self.feature_aggregator(x)
+        x = self.dropout_agg(x)
+
+        if self.project_features is not None:
+            features = self.project_features(features)
+        x, targets = self.wav2vec_predictions(x, features)
+        result["cpc_logits"] = x
+        result["cpc_targets"] = targets
+
+        return result
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return sys.maxsize
+
+    def get_logits(self, net_output):
+        logits = net_output["cpc_logits"]
+        return logits
+
+    def get_targets(self, sample, net_output):
+        t = net_output["cpc_targets"]
+        if isinstance(t, tuple):
+            t = t[0]
+        return t.contiguous()
+
+    def get_target_weights(self, targets, net_output):
+        targets = net_output["cpc_targets"]
+        if isinstance(targets, tuple) and targets[-1] is not None:
+            return targets[-1]
+        return None
+
+    def get_extra_losses(self, net_output):
+        loss = None
+        if "prob_perplexity" in net_output:
+            loss = net_output["num_vars"] - net_output["prob_perplexity"]
+        elif "kmeans_loss" in net_output:
+            loss = net_output["kmeans_loss"]
+
+        return loss
+
+
+def norm_block(is_layer_norm, dim, affine=True):  # returns the normalization module placed after each conv in this file
+    if is_layer_norm:
+        mod = nn.Sequential(
+            TransposeLast(),  # NOTE(review): presumably swaps the last two dims so the norm acts on channels — confirm
+            Fp32LayerNorm(dim, elementwise_affine=affine),
+            TransposeLast(),  # NOTE(review): presumably restores the original layout — confirm
+        )
+    else:
+        mod = Fp32GroupNorm(1, dim, affine=affine)  # one group over `dim` channels, assuming nn.GroupNorm's argument order
+
+    return mod
+
+
+class ConvFeatureExtractionModel(nn.Module):
+    def __init__(
+        self,
+        conv_layers,
+        dropout,
+        log_compression,
+        skip_connections,
+        residual_scale,
+        non_affine_group_norm,
+        activation,
+    ):
+        super().__init__()
+
+        def block(n_in, n_out, k, stride):
+            return nn.Sequential(
+                nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),
+                nn.Dropout(p=dropout),
+                norm_block(
+                    is_layer_norm=False, dim=n_out, affine=not non_affine_group_norm
+                ),
+                activation,
+            )
+
+        in_d = 1
+        self.conv_layers = nn.ModuleList()
+        for dim, k, stride in conv_layers:
+            self.conv_layers.append(block(in_d, dim, k, stride))
+            in_d = dim
+
+        self.log_compression = log_compression
+        self.skip_connections = skip_connections
+        self.residual_scale = math.sqrt(residual_scale)
+
+    def forward(self, x):
+        # BxT -> BxCxT (C is 1 right after the unsqueeze)
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            residual = x
+            x = conv(x)
+            if self.skip_connections and x.size(1) == residual.size(1):  # skip only when the channel count is unchanged
+                tsz = x.size(2)
+                r_tsz = residual.size(2)
+                residual = residual[..., :: r_tsz // tsz][..., :tsz]  # subsample in time by the integer length ratio, then trim to tsz
+                x = (x + residual) * self.residual_scale  # residual_scale holds sqrt(residual_scale) from __init__
+
+        if self.log_compression:  # computes log(|x| + 1)
+            x = x.abs()
+            x = x + 1
+            x = x.log()
+
+        return x
+
+
+class ZeroPad1d(nn.Module):  # pads the last dimension on the left/right using F.pad's default fill
+    def __init__(self, pad_left, pad_right):
+        super().__init__()
+        self.pad_left = pad_left
+        self.pad_right = pad_right
+
+    def forward(self, x):
+        return F.pad(x, (self.pad_left, self.pad_right))  # (left, right) pad amounts for the last dim
+
+
+class ConvAggegator(nn.Module):
+    """Stack of left-padded 1-d conv blocks with optional scaled skip connections."""
+
+    def __init__(
+        self,
+        conv_layers,
+        embed,
+        dropout,
+        skip_connections,
+        residual_scale,
+        non_affine_group_norm,
+        conv_bias,
+        zero_pad,
+        activation,
+    ):
+        super().__init__()
+
+        def block(n_in, n_out, k, stride):
+            # padding dims only really make sense for stride = 1
+            # pad only on the left, by k - 1 frames in total
+            left = k - 1
+            if zero_pad:
+                pad = ZeroPad1d(left, 0)
+            else:
+                pad = nn.ReplicationPad1d((left, 0))
+            conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
+            norm = norm_block(False, n_out, affine=not non_affine_group_norm)
+            # NOTE: the same activation module instance is reused by every block
+            return nn.Sequential(pad, conv, nn.Dropout(p=dropout), norm, activation)
+
+        self.conv_layers = nn.ModuleList()
+        self.residual_proj = nn.ModuleList()
+        in_d = embed
+        for dim, k, stride in conv_layers:
+            # a 1x1 conv aligns channel counts on the skip path; None means identity
+            needs_proj = skip_connections and in_d != dim
+            proj = nn.Conv1d(in_d, dim, 1, bias=False) if needs_proj else None
+            self.residual_proj.append(proj)
+            self.conv_layers.append(block(in_d, dim, k, stride))
+            in_d = dim
+        # iterating the Sequential in forward() still yields one block at a time
+        self.conv_layers = nn.Sequential(*self.conv_layers)
+        self.skip_connections = skip_connections
+        self.residual_scale = math.sqrt(residual_scale)
+
+    def forward(self, x):
+        """Apply each block in turn, adding the (projected) input when skips are on."""
+        for proj, layer in zip(self.residual_proj, self.conv_layers):
+            skip = x
+            x = layer(x)
+            if not self.skip_connections:
+                continue
+            if proj is not None:
+                skip = proj(skip)
+            x = (x + skip) * self.residual_scale
+        return x
+
+
+class Wav2VecPredictionsModel(nn.Module):
+    """Projects ``x`` to one prediction per step and scores it against targets."""
+
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        prediction_steps,
+        n_negatives,
+        cross_sample_negatives,
+        sample_distance,
+        dropout,
+        offset,
+        balanced_classes,
+        infonce,
+    ):
+        super().__init__()
+
+        self.n_negatives = n_negatives
+        self.cross_sample_negatives = cross_sample_negatives
+        self.sample_distance = sample_distance
+        self.project_to_steps = nn.ConvTranspose2d(
+            in_dim, out_dim, (1, prediction_steps)
+        )
+        self.dropout = nn.Dropout(p=dropout)
+        self.offset = offset
+        self.balanced_classes = balanced_classes
+        self.infonce = infonce
+
+    def sample_negatives(self, y):
+        """Sample negatives for every frame of ``y`` (BxCxT); returns NxBxCxT."""
+        bsz, fsz, tsz = y.shape
+
+        y = y.transpose(0, 1)  # BCT -> CBT
+        y = y.contiguous().view(fsz, -1)  # CBT => C(BxT)
+
+        cross_high = tsz * bsz
+        high = tsz if self.sample_distance is None else min(tsz, self.sample_distance)
+        assert high > 1
+
+        with torch.no_grad():
+            if self.n_negatives > 0:
+                tszs = (
+                    buffered_arange(tsz)
+                    .unsqueeze(-1)
+                    .expand(-1, self.n_negatives)
+                    .flatten()
+                )
+
+                neg_idxs = torch.randint(
+                    low=0, high=high - 1, size=(bsz, self.n_negatives * tsz)
+                )
+                neg_idxs[neg_idxs >= tszs] += 1  # never pick a frame's own index
+
+            if self.cross_sample_negatives > 0:
+                tszs = (
+                    buffered_arange(tsz)
+                    .unsqueeze(-1)
+                    .expand(-1, self.cross_sample_negatives)
+                    .flatten()
+                )
+
+                cross_neg_idxs = torch.randint(
+                    low=0,
+                    high=cross_high - 1,
+                    size=(bsz, self.cross_sample_negatives * tsz),
+                )
+                cross_neg_idxs[cross_neg_idxs >= tszs] += 1
+
+        if self.n_negatives > 0:
+            # sample b occupies columns [b * tsz, (b + 1) * tsz) of the flattened view
+            neg_idxs += torch.arange(bsz).unsqueeze(1) * tsz
+        else:
+            neg_idxs = cross_neg_idxs
+
+        if self.cross_sample_negatives > 0 and self.n_negatives > 0:
+            neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
+
+        negs = y[..., neg_idxs.view(-1)]
+        n_total = self.n_negatives + self.cross_sample_negatives
+        negs = negs.view(fsz, bsz, n_total, tsz).permute(2, 1, 0, 3)  # to NxBxCxT
+
+        return negs
+
+    def forward(self, x, y):
+        """Return ``(predictions, labels)`` for ``x`` and ``y``, both BxCxT."""
+        x = x.unsqueeze(-1)
+        x = self.project_to_steps(x)  # BxCxTxS
+        x = self.dropout(x)
+
+        negatives = self.sample_negatives(y)
+        y = y.unsqueeze(0)
+        targets = torch.cat([y, negatives], dim=0)  # Copies x B x C x T
+
+        copies = targets.size(0)
+        bsz, dim, tsz, steps = x.shape
+        steps = min(steps, tsz - self.offset)
+
+        # one flat buffer receives the scores of every step, filled in the loop below
+        predictions = x.new(
+            bsz * copies * (tsz - self.offset + 1) * steps
+            - ((steps + 1) * steps // 2) * copies * bsz
+        )
+        if self.infonce:
+            labels = predictions.new_full(
+                (predictions.shape[0] // copies,), 0, dtype=torch.long
+            )
+        else:
+            labels = torch.zeros_like(predictions)
+        weights = (
+            torch.full_like(labels, 1 / self.n_negatives)
+            if self.balanced_classes and not self.infonce
+            else None
+        )
+
+        start = end = 0
+        for i in range(steps):
+            offset = i + self.offset
+            end = start + (tsz - offset) * bsz * copies
+            # slice by length: ``:-offset`` would select nothing when offset == 0
+            if self.infonce:
+                predictions[start:end] = torch.einsum(
+                    "bct,nbct->tbn", x[..., : tsz - offset, i], targets[..., offset:]
+                ).flatten()
+            else:
+                pos_num = (end - start) // copies
+                predictions[start:end] = torch.einsum(
+                    "bct,nbct->nbt", x[..., : tsz - offset, i], targets[..., offset:]
+                ).flatten()
+                labels[start : start + pos_num] = 1.0
+                if weights is not None:
+                    weights[start : start + pos_num] = 1.0
+            start = end
+        assert end == predictions.numel(), "{} != {}".format(end, predictions.numel())
+
+        if self.infonce:
+            predictions = predictions.view(-1, copies)
+        else:
+            if weights is not None:
+                labels = (labels, weights)
+
+        return predictions, labels
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
new file mode 100644
index 0000000000..8214bc8e7a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
@@ -0,0 +1,1284 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq import utils
+from fairseq.data.data_utils import compute_mask_indices
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.distributed import fsdp_wrap
+from fairseq.models import BaseFairseqModel, register_model
+from fairseq.modules import (
+    Fp32GroupNorm,
+    Fp32LayerNorm,
+    GradMultiply,
+    GumbelVectorQuantizer,
+    LayerNorm,
+    MultiheadAttention,
+    RelPositionalEncoding,
+    SamePad,
+    TransposeLast,
+)
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+from fairseq.modules.conformer_layer import ConformerWav2Vec2EncoderLayer
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from fairseq.utils import buffered_arange, index_put, is_xla_tensor
+
+from .utils import pad_to_multiple
+
+EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"])
+MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"])
+LAYER_TYPE_CHOICES = ChoiceEnum(["transformer", "conformer"])
+
+
+@dataclass
+class Wav2Vec2Config(FairseqDataclass):
+    """Configuration options for the model registered as "wav2vec2"."""
+
+    extractor_mode: EXTRACTOR_MODE_CHOICES = field(
+        default="default",
+        metadata={
+            "help": "mode for feature extractor. default has a single group norm with d "
+            "groups in the first conv block, whereas layer_norm has layer norms in "
+            "every block (meant to use with normalize=True)"
+        },
+    )
+    encoder_layers: int = field(
+        default=12, metadata={"help": "num encoder layers in the transformer"}
+    )
+    encoder_embed_dim: int = field(
+        default=768, metadata={"help": "encoder embedding dimension"}
+    )
+    encoder_ffn_embed_dim: int = field(
+        default=3072, metadata={"help": "encoder embedding dimension for FFN"}
+    )
+    encoder_attention_heads: int = field(
+        default=12, metadata={"help": "num encoder attention heads"}
+    )
+    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
+        default="gelu", metadata={"help": "activation function to use"}
+    )
+    layer_type: LAYER_TYPE_CHOICES = field(
+        default="transformer", metadata={"help": "layer type in encoder"}
+    )
+    # dropouts
+    dropout: float = field(
+        default=0.1, metadata={"help": "dropout probability for the transformer"}
+    )
+    attention_dropout: float = field(
+        default=0.1, metadata={"help": "dropout probability for attention weights"}
+    )
+    activation_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability after activation in FFN"}
+    )
+    encoder_layerdrop: float = field(
+        default=0.0, metadata={"help": "probability of dropping a transformer layer"}
+    )
+    dropout_input: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the input (after feat extr)"},
+    )
+    dropout_features: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the features (after feat extr)"},
+    )
+
+    final_dim: int = field(
+        default=0,
+        metadata={
+            "help": "project final representations and targets to this many dimensions."
+            " set to encoder_embed_dim if <= 0"
+        },
+    )
+    layer_norm_first: bool = field(
+        default=False, metadata={"help": "apply layernorm first in the transformer"}
+    )
+    conv_feature_layers: str = field(
+        default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+        metadata={
+            "help": "string describing convolutional feature extraction layers in form of a python list that contains "
+            "[(dim, kernel_size, stride), ...]"
+        },
+    )
+    conv_bias: bool = field(
+        default=False, metadata={"help": "include bias in conv encoder"}
+    )
+    logit_temp: float = field(
+        default=0.1, metadata={"help": "temperature to divide logits by"}
+    )
+    quantize_targets: bool = field(
+        default=False, metadata={"help": "use quantized targets"}
+    )
+    quantize_input: bool = field(
+        default=False, metadata={"help": "use quantized inputs"}
+    )
+    same_quantizer: bool = field(
+        default=False, metadata={"help": "use same quantizer for inputs and targets"}
+    )
+    target_glu: bool = field(
+        default=False, metadata={"help": "adds projection + glu to targets"}
+    )
+    feature_grad_mult: float = field(
+        default=1.0, metadata={"help": "multiply feature extractor var grads by this"}
+    )
+    quantizer_depth: int = field(
+        default=1, metadata={"help": "number of quantizer layers"}
+    )
+    quantizer_factor: int = field(
+        default=3,
+        metadata={
+            "help": "dimensionality increase for inner quantizer layers (if depth > 1)"
+        },
+    )
+    latent_vars: int = field(
+        default=320,
+        metadata={"help": "number of latent variables V in each group of the codebook"},
+    )
+    latent_groups: int = field(
+        default=2,
+        metadata={"help": "number of groups G of latent variables in the codebook"},
+    )
+    latent_dim: int = field(
+        default=0,
+        metadata={
+            "help": "if > 0, uses this dimensionality for latent variables. "
+            "otherwise uses final_dim / latent_groups"
+        },
+    )
+
+    # masking
+    mask_length: int = field(default=10, metadata={"help": "mask length"})
+    mask_prob: float = field(
+        default=0.65, metadata={"help": "probability of replacing a token with mask"}
+    )
+    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose mask length"}
+    )
+    mask_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_overlap: bool = field(
+        default=False, metadata={"help": "whether to forbid overlapping masks"}
+    )
+    mask_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+    require_same_masks: bool = field(
+        default=True,
+        metadata={
+            "help": "whether the number of masked timesteps must be the same across all "
+            "examples in a batch"
+        },
+    )
+    mask_dropout: float = field(
+        default=0.0,
+        metadata={"help": "percent of masks to unmask for each sample"},
+    )
+
+    # channel masking
+    mask_channel_length: int = field(
+        default=10, metadata={"help": "length of the mask for features (channels)"}
+    )
+    mask_channel_prob: float = field(
+        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
+    )
+    mask_channel_before: bool = False
+    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static",
+        metadata={"help": "how to choose mask length for channel masking"},
+    )
+    mask_channel_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_channel_overlap: bool = field(
+        default=False, metadata={"help": "whether to forbid overlapping channel masks"}
+    )
+    mask_channel_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+
+    # negative selection
+    num_negatives: int = field(
+        default=100,
+        metadata={"help": "number of negative examples from the same sample"},
+    )
+    negatives_from_everywhere: bool = field(
+        default=False,
+        metadata={"help": "sample negatives from everywhere, not just masked states"},
+    )
+    cross_sample_negatives: int = field(
+        default=0, metadata={"help": "number of negative examples from any sample"}
+    )
+    codebook_negatives: int = field(
+        default=0, metadata={"help": "number of negative examples from the codebook"}
+    )
+
+    # positional embeddings
+    conv_pos: int = field(
+        default=128,
+        metadata={"help": "number of filters for convolutional positional embeddings"},
+    )
+    conv_pos_groups: int = field(
+        default=16,
+        metadata={"help": "number of groups for convolutional positional embedding"},
+    )
+    pos_conv_depth: int = field(
+        default=1, metadata={"help": "depth of positional encoder network"}
+    )
+
+    latent_temp: Tuple[float, float, float] = field(
+        default=(2, 0.5, 0.999995),
+        metadata={
+            "help": "temperature for latent variable sampling. "
+            "can be tuple of 3 values (start, end, decay)"
+        },
+    )
+    max_positions: int = field(default=100000, metadata={"help": "Max positions"})
+    checkpoint_activations: bool = field(
+        default=False,
+        metadata={"help": "recompute activations and save memory for extra compute"},
+    )
+
+    # FP16 optimization
+    required_seq_len_multiple: int = field(
+        default=2,
+        metadata={
+            "help": "pad the input to encoder such that the sequence length is divisible by multiple"
+        },
+    )
+    crop_seq_to_multiple: int = field(
+        default=1,
+        metadata={
+            "help": "crop convolutional feature extractor output such that the sequence length is divisible by multiple"
+        },
+    )
+
+    # Conformer
+    depthwise_conv_kernel_size: int = field(
+        default=31,
+        metadata={
+            "help": "depthwise-conv-kernel-size for convolution in conformer layer"
+        },
+    )
+    attn_type: str = field(
+        default="",
+        metadata={"help": "if espnet use ESPNET MHA"},
+    )
+    pos_enc_type: str = field(
+        default="abs",
+        metadata={"help": "Positional encoding type to use in conformer"},
+    )
+    fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"})
+
+
+@register_model("wav2vec2", dataclass=Wav2Vec2Config)
+class Wav2Vec2Model(BaseFairseqModel):
+    def __init__(self, cfg: Wav2Vec2Config):
+        """Build all sub-modules of the model from ``cfg``."""
+        super().__init__()
+        self.cfg = cfg
+
+        feature_enc_layers = eval(cfg.conv_feature_layers)  # executes config text
+        self.embed = feature_enc_layers[-1][0]
+
+        self.feature_extractor = ConvFeatureExtractionModel(
+            conv_layers=feature_enc_layers,
+            dropout=0.0,
+            mode=cfg.extractor_mode,
+            conv_bias=cfg.conv_bias,
+        )
+
+        self.post_extract_proj = (
+            nn.Linear(self.embed, cfg.encoder_embed_dim)
+            if self.embed != cfg.encoder_embed_dim and not cfg.quantize_input
+            else None
+        )
+
+        self.crop_seq_to_multiple = cfg.crop_seq_to_multiple
+
+        self.mask_prob = cfg.mask_prob
+        self.mask_selection = cfg.mask_selection
+        self.mask_other = cfg.mask_other
+        self.mask_length = cfg.mask_length
+        self.no_mask_overlap = cfg.no_mask_overlap
+        self.mask_min_space = cfg.mask_min_space
+
+        self.mask_channel_prob = cfg.mask_channel_prob
+        self.mask_channel_before = cfg.mask_channel_before
+        self.mask_channel_selection = cfg.mask_channel_selection
+        self.mask_channel_other = cfg.mask_channel_other
+        self.mask_channel_length = cfg.mask_channel_length
+        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
+        self.mask_channel_min_space = cfg.mask_channel_min_space
+
+        self.dropout_input = nn.Dropout(cfg.dropout_input)
+        self.dropout_features = nn.Dropout(cfg.dropout_features)
+        self.feature_grad_mult = cfg.feature_grad_mult
+
+        self.quantizer = None
+        self.input_quantizer = None
+
+        self.n_negatives = cfg.num_negatives
+        self.cross_sample_negatives = cfg.cross_sample_negatives
+        self.codebook_negatives = cfg.codebook_negatives
+        self.negatives_from_everywhere = cfg.negatives_from_everywhere
+
+        self.logit_temp = cfg.logit_temp
+
+        final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
+
+        if cfg.quantize_targets:
+            vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else final_dim
+            self.quantizer = GumbelVectorQuantizer(
+                dim=self.embed,
+                num_vars=cfg.latent_vars,
+                temp=cfg.latent_temp,
+                groups=cfg.latent_groups,
+                combine_groups=False,
+                vq_dim=vq_dim,
+                time_first=True,
+                weight_proj_depth=cfg.quantizer_depth,
+                weight_proj_factor=cfg.quantizer_factor,
+            )
+            self.project_q = nn.Linear(vq_dim, final_dim)
+        else:
+            self.project_q = nn.Linear(self.embed, final_dim)
+
+        if cfg.quantize_input:
+            if cfg.same_quantizer and self.quantizer is not None:
+                # vq_dim from the target quantizer above is reused: it is the output size
+                self.input_quantizer = self.quantizer
+            else:
+                vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else cfg.encoder_embed_dim
+                self.input_quantizer = GumbelVectorQuantizer(
+                    dim=self.embed,
+                    num_vars=cfg.latent_vars,
+                    temp=cfg.latent_temp,
+                    groups=cfg.latent_groups,
+                    combine_groups=False,
+                    vq_dim=vq_dim,
+                    time_first=True,
+                    weight_proj_depth=cfg.quantizer_depth,
+                    weight_proj_factor=cfg.quantizer_factor,
+                )
+            self.project_inp = nn.Linear(vq_dim, cfg.encoder_embed_dim)
+
+        self.mask_emb = nn.Parameter(
+            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
+        )
+        encoder_cls = TransformerEncoder
+        if cfg.layer_type == "conformer" and cfg.pos_enc_type in ["rel_pos", "rope"]:
+            encoder_cls = ConformerEncoder
+
+        self.encoder = encoder_cls(cfg)
+        self.layer_norm = LayerNorm(self.embed)
+
+        self.target_glu = None
+        if cfg.target_glu:
+            self.target_glu = nn.Sequential(
+                nn.Linear(final_dim, final_dim * 2), nn.GLU()
+            )
+
+        self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, cfg: Wav2Vec2Config, task=None):
+        """Build a new model instance from ``cfg``."""
+        # ``task`` is accepted but not used here
+        return cls(cfg)
+
+    def apply_mask(
+        self,
+        x,
+        padding_mask,
+        mask_indices=None,
+        mask_channel_indices=None,
+    ):
+        """Mask time steps (with ``mask_emb``) and channels (with 0) of a BxTxC input.
+
+        Returns the masked tensor and the time mask that was applied, which is
+        None when ``mask_prob`` is not positive.
+
+        Args:
+            x: BxTxC features.
+            padding_mask: forwarded to ``compute_mask_indices`` for the time mask.
+            mask_indices: precomputed time mask; sampled when None.
+            mask_channel_indices: precomputed channel mask; sampled when None
+                (always resampled when ``mask_channel_before`` is set).
+        """
+        B, T, C = x.shape
+
+        def sample_channel_mask():
+            # one mask per (batch, channel), repeated over all T time steps
+            picked = compute_mask_indices(
+                (B, C),
+                None,
+                self.mask_channel_prob,
+                self.mask_channel_length,
+                self.mask_channel_selection,
+                self.mask_channel_other,
+                no_overlap=self.no_mask_channel_overlap,
+                min_space=self.mask_channel_min_space,
+            )
+            picked = torch.from_numpy(picked).to(x.device)
+            return picked.unsqueeze(1).expand(-1, T, -1)
+
+        channel_masking = self.mask_channel_prob > 0
+
+        if channel_masking and self.mask_channel_before:
+            # written into x in place, before any time masking
+            mask_channel_indices = sample_channel_mask()
+            x[mask_channel_indices] = 0
+
+        if self.mask_prob > 0:
+            if mask_indices is None:
+                mask_indices = compute_mask_indices(
+                    (B, T),
+                    padding_mask,
+                    self.mask_prob,
+                    self.mask_length,
+                    self.mask_selection,
+                    self.mask_other,
+                    min_masks=2,
+                    no_overlap=self.no_mask_overlap,
+                    min_space=self.mask_min_space,
+                    require_same_masks=self.cfg.require_same_masks,
+                    mask_dropout=self.cfg.mask_dropout,
+                )
+                mask_indices = torch.from_numpy(mask_indices).to(x.device)
+            x = index_put(x, mask_indices, self.mask_emb)
+        else:
+            mask_indices = None
+
+        if channel_masking and not self.mask_channel_before:
+            if mask_channel_indices is None:
+                mask_channel_indices = sample_channel_mask()
+            x = index_put(x, mask_channel_indices, 0)
+
+        return x, mask_indices
+
+    def sample_negatives(self, y, num, padding_count=None):
+        """Sample negatives from ``y`` (BxTxC) for ``num`` targets per batch entry."""
+        if self.n_negatives == 0 and self.cross_sample_negatives == 0:
+            return y.new(0)  # NOTE(review): bare tensor, other path returns a pair
+
+        bsz, tsz, fsz = y.shape
+        y = y.view(-1, fsz)  # BTC => (BxT)C
+
+        # padding_count only shrinks the sampling range; rows stay tsz apart per sample
+        cross_high = tsz * bsz
+        high = tsz - (padding_count or 0)
+        with torch.no_grad():
+            assert high > 1, f"{bsz,tsz,fsz}"
+
+            if self.n_negatives > 0:
+                tszs = (
+                    buffered_arange(num)
+                    .unsqueeze(-1)
+                    .expand(-1, self.n_negatives)
+                    .flatten()
+                )
+
+                neg_idxs = torch.randint(
+                    low=0, high=high - 1, size=(bsz, self.n_negatives * num)
+                )
+                neg_idxs[neg_idxs >= tszs] += 1
+
+            if self.cross_sample_negatives > 0:
+                tszs = (
+                    buffered_arange(num)
+                    .unsqueeze(-1)
+                    .expand(-1, self.cross_sample_negatives)
+                    .flatten()
+                )
+
+                cross_neg_idxs = torch.randint(
+                    low=0,
+                    high=cross_high - 1,
+                    size=(bsz, self.cross_sample_negatives * num),
+                )
+                cross_neg_idxs[cross_neg_idxs >= tszs] += 1
+
+        if self.n_negatives > 0:
+            neg_idxs = neg_idxs + (torch.arange(bsz).unsqueeze(1) * tsz)
+        else:
+            neg_idxs = cross_neg_idxs
+
+        if self.cross_sample_negatives > 0 and self.n_negatives > 0:
+            neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)
+
+        negs = y[neg_idxs.view(-1)]
+        negs = negs.view(
+            bsz, num, self.n_negatives + self.cross_sample_negatives, fsz
+        ).permute(
+            2, 0, 1, 3
+        )  # to NxBxTxC
+        return negs, neg_idxs
+
+    def compute_preds(self, x, y, negatives):
+        """Score ``x`` against ``y`` (index 0) and each negative by cosine similarity."""
+        # negatives that are identical to the positive get masked out below
+        neg_is_pos = (y == negatives).all(-1)
+        candidates = torch.cat([y.unsqueeze(0), negatives], dim=0)
+
+        sims = torch.cosine_similarity(x.float(), candidates.float(), dim=-1)
+        logits = (sims / self.logit_temp).type_as(x)
+
+        on_xla = is_xla_tensor(logits)
+        if on_xla or neg_is_pos.any():
+            if not hasattr(self, "_inftensor"):
+                # fill value is created once and cached on the module
+                if on_xla:
+                    self._inftensor = torch.tensor(-float(2**30)).to(x.device)
+                else:
+                    self._inftensor = float("-inf")
+            # rows 1: hold the negatives; row 0 is the positive and is left alone
+            logits[1:] = index_put(logits[1:], neg_is_pos, self._inftensor)
+
+        return logits
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Map input lengths to the lengths left after every conv layer listed in
+        ``cfg.conv_feature_layers`` has been applied
+        """
+
+        # the layer spec is a python expression kept as a string in the config
+        layer_specs = eval(self.cfg.conv_feature_layers)
+
+        lengths = input_lengths
+        for spec in layer_specs:
+            kernel_size, stride = spec[1], spec[2]
+            # output length of an unpadded 1-d convolution
+            lengths = torch.floor((lengths - kernel_size) / stride + 1)
+
+        return lengths.to(torch.long)
+
+    def forward(
+        self,
+        source,
+        padding_mask=None,
+        mask=True,
+        features_only=False,
+        layer=None,
+        mask_indices=None,
+        mask_channel_indices=None,
+        padding_count=None,
+    ):
+
+        if self.feature_grad_mult > 0:
+            features = self.feature_extractor(source)
+            if self.feature_grad_mult != 1.0:
+                features = GradMultiply.apply(features, self.feature_grad_mult)
+        else:
+            with torch.no_grad():
+                features = self.feature_extractor(source)
+
+        features_pen = features.float().pow(2).mean()
+
+        features = features.transpose(1, 2)
+        features = self.layer_norm(features)
+        unmasked_features = features.clone()
+
+        if padding_mask is not None and padding_mask.any():
+            input_lengths = (1 - padding_mask.long()).sum(-1)
+            # apply conv formula to get real output_lengths
+            output_lengths = self._get_feat_extract_output_lengths(input_lengths)
+
+            padding_mask = torch.zeros(
+                features.shape[:2], dtype=features.dtype, device=features.device
+            )
+
+            # these two operations makes sure that all values
+            # before the output lengths indices are attended to
+            padding_mask[
+                (
+                    torch.arange(padding_mask.shape[0], device=padding_mask.device),
+                    output_lengths - 1,
+                )
+            ] = 1
+            padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
+        else:
+            padding_mask = None
+
+        time_steps_to_drop = features.size(1) % self.crop_seq_to_multiple
+        if time_steps_to_drop != 0:
+            features = features[:, :-time_steps_to_drop]
+            unmasked_features = unmasked_features[:, :-time_steps_to_drop]
+            if padding_mask is not None:
+                padding_mask = padding_mask[:, :-time_steps_to_drop]
+
+        if self.post_extract_proj is not None:
+            features = self.post_extract_proj(features)
+
+        features = self.dropout_input(features)
+        unmasked_features = self.dropout_features(unmasked_features)
+
+        num_vars = None
+        code_ppl = None
+        prob_ppl = None
+        curr_temp = None
+
+        if self.input_quantizer:
+            q = self.input_quantizer(features, produce_targets=False)
+            features = q["x"]
+            num_vars = q["num_vars"]
+            code_ppl = q["code_perplexity"]
+            prob_ppl = q["prob_perplexity"]
+            curr_temp = q["temp"]
+            features = self.project_inp(features)
+
+        if mask:
+            x, mask_indices = self.apply_mask(
+                features,
+                padding_mask,
+                mask_indices=mask_indices,
+                mask_channel_indices=mask_channel_indices,
+            )
+            if not is_xla_tensor(x) and mask_indices is not None:
+                # tpu-comment: reducing the size in a dynamic way causes
+                # too many recompilations on xla.
+                y = unmasked_features[mask_indices].view(
+                    unmasked_features.size(0), -1, unmasked_features.size(-1)
+                )
+            else:
+                y = unmasked_features
+        else:
+            x = features
+            y = unmasked_features
+            mask_indices = None
+
+        x, layer_results = self.encoder(x, padding_mask=padding_mask, layer=layer)
+
+        if features_only:
+            return {
+                "x": x,
+                "padding_mask": padding_mask,
+                "features": unmasked_features,
+                "layer_results": layer_results,
+            }
+
+        if self.quantizer:
+            if self.negatives_from_everywhere:
+                q = self.quantizer(unmasked_features, produce_targets=False)
+                y = q["x"]
+                num_vars = q["num_vars"]
+                code_ppl = q["code_perplexity"]
+                prob_ppl = q["prob_perplexity"]
+                curr_temp = q["temp"]
+                y = self.project_q(y)
+
+                negs, _ = self.sample_negatives(
+                    y,
+                    mask_indices[0].sum(),
+                    padding_count=padding_count,
+                )
+                y = y[mask_indices].view(y.size(0), -1, y.size(-1))
+
+            else:
+                q = self.quantizer(y, produce_targets=False)
+                y = q["x"]
+                num_vars = q["num_vars"]
+                code_ppl = q["code_perplexity"]
+                prob_ppl = q["prob_perplexity"]
+                curr_temp = q["temp"]
+
+                y = self.project_q(y)
+
+                negs, _ = self.sample_negatives(
+                    y,
+                    y.size(1),
+                    padding_count=padding_count,
+                )
+
+            if self.codebook_negatives > 0:
+                cb_negs = self.quantizer.sample_from_codebook(
+                    y.size(0) * y.size(1), self.codebook_negatives
+                )
+                cb_negs = cb_negs.view(
+                    self.codebook_negatives, y.size(0), y.size(1), -1
+                )  # order doesnt matter
+                cb_negs = self.project_q(cb_negs)
+                negs = torch.cat([negs, cb_negs], dim=0)
+        else:
+            y = self.project_q(y)
+
+            if self.negatives_from_everywhere:
+                negs, _ = self.sample_negatives(
+                    unmasked_features,
+                    y.size(1),
+                    padding_count=padding_count,
+                )
+                negs = self.project_q(negs)
+            else:
+                negs, _ = self.sample_negatives(
+                    y,
+                    y.size(1),
+                    padding_count=padding_count,
+                )
+
+        if not is_xla_tensor(x):
+            # tpu-comment: reducing the size in a dynamic way causes
+            # too many recompilations on xla.
+            x = x[mask_indices].view(x.size(0), -1, x.size(-1))
+
+        if self.target_glu:
+            y = self.target_glu(y)
+            negs = self.target_glu(negs)
+
+        x = self.final_proj(x)
+        x = self.compute_preds(x, y, negs)
+
+        result = {
+            "x": x,
+            "padding_mask": padding_mask,
+            "features_pen": features_pen,
+        }
+
+        if prob_ppl is not None:
+            result["prob_perplexity"] = prob_ppl
+            result["code_perplexity"] = code_ppl
+            result["num_vars"] = num_vars
+            result["temp"] = curr_temp
+
+        return result
+
+    def quantize(self, x):
+        """Extract features from ``x`` and return ``self.quantizer.forward_idx`` of them."""
+        assert self.quantizer is not None
+        feats = self.feature_extractor(x).transpose(1, 2)
+        feats = self.layer_norm(feats)
+        return self.quantizer.forward_idx(feats)
+
+    def extract_features(self, source, padding_mask, mask=False, layer=None):
+        """Call ``forward`` with ``features_only=True`` and return its result unchanged."""
+        return self.forward(
+            source, padding_mask, mask=mask, features_only=True, layer=layer
+        )
+
+    def get_logits(self, net_output):
+        """Swap dims 0 and 2 of ``net_output["x"]`` and flatten to 2-D, keeping the last dim."""
+        swapped = net_output["x"].transpose(0, 2)
+        last_dim = swapped.size(-1)
+        return swapped.reshape(-1, last_dim)
+
+    def get_targets(self, sample, net_output, expand_steps=True):
+        preds = net_output["x"]  # only its sizes are read; every target is index 0
+        return preds.new_zeros(preds.size(1) * preds.size(2), dtype=torch.long)
+
+    def get_extra_losses(self, net_output):
+        """Collect the auxiliary penalty terms present in ``net_output``.
+
+        Returns a list holding, in order, ``(num_vars - prob_perplexity) / num_vars``
+        when ``"prob_perplexity"`` is present and ``features_pen`` when present.
+        """
+        extra = []
+        if "prob_perplexity" in net_output:
+            n_vars = net_output["num_vars"]
+            extra.append((n_vars - net_output["prob_perplexity"]) / n_vars)
+        if "features_pen" in net_output:
+            extra.append(net_output["features_pen"])
+        return extra
+
+    def remove_pretraining_modules(self, last_layer=None):
+        """Clear pre-training heads; optionally keep encoder layers up to last_layer."""
+        for attr in ("quantizer", "project_q", "target_glu", "final_proj"):
+            setattr(self, attr, None)
+        if last_layer is None:
+            return
+        kept = [
+            mod for idx, mod in enumerate(self.encoder.layers) if idx <= last_layer
+        ]
+        self.encoder.layers = nn.ModuleList(kept)
+
+
+class ConvFeatureExtractionModel(nn.Module):
+    """Stack of ``Conv1d -> Dropout -> (norm) -> GELU`` blocks applied in sequence."""
+
+    def __init__(
+        self,
+        conv_layers: List[Tuple[int, int, int]],
+        dropout: float = 0.0,
+        mode: str = "default",
+        conv_bias: bool = False,
+    ):
+        # conv_layers: one (dim, kernel, stride) triple per block, applied in order
+        super().__init__()
+        assert mode in {"default", "layer_norm"}
+
+        def block(
+            n_in,
+            n_out,
+            k,
+            stride,
+            is_layer_norm=False,
+            is_group_norm=False,
+            conv_bias=False,
+        ):
+            def make_conv():
+                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
+                nn.init.kaiming_normal_(conv.weight)
+                return conv
+
+            assert not (is_layer_norm and is_group_norm), (
+                "layer norm and group norm are exclusive"
+            )
+            # norms are sized by this block's own n_out, not by an outer loop variable
+            if is_layer_norm:
+                return nn.Sequential(
+                    make_conv(),
+                    nn.Dropout(p=dropout),
+                    nn.Sequential(
+                        TransposeLast(),
+                        Fp32LayerNorm(n_out, elementwise_affine=True),
+                        TransposeLast(),
+                    ),
+                    nn.GELU(),
+                )
+            elif is_group_norm:
+                return nn.Sequential(
+                    make_conv(),
+                    nn.Dropout(p=dropout),
+                    Fp32GroupNorm(n_out, n_out, affine=True),
+                    nn.GELU(),
+                )
+            else:
+                return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
+
+        in_d = 1
+        self.conv_layers = nn.ModuleList()
+        for i, cl in enumerate(conv_layers):
+            assert len(cl) == 3, "invalid conv definition: " + str(cl)
+            (dim, k, stride) = cl
+
+            self.conv_layers.append(
+                block(
+                    in_d,
+                    dim,
+                    k,
+                    stride,
+                    is_layer_norm=mode == "layer_norm",
+                    is_group_norm=mode == "default" and i == 0,
+                    conv_bias=conv_bias,
+                )
+            )
+            in_d = dim
+
+    def forward(self, x):
+        """Add a channel dim to ``x`` and run it through every conv block in order."""
+        # BxT -> BxCxT
+        x = x.unsqueeze(1)
+        for conv in self.conv_layers:
+            x = conv(x)
+        return x
+
+
+def make_conv_pos(e, k, g):
+    """Build the positional-convolution stack: weight-normed Conv1d, SamePad, GELU.
+
+    ``e`` is used as both in and out channels, ``k`` is the kernel size
+    (padding ``k // 2``) and ``g`` the number of groups.
+    """
+    conv = nn.Conv1d(e, e, kernel_size=k, padding=k // 2, groups=g)
+
+    # normal init with std = sqrt(4 * (1 - dropout) / (k * e)); dropout is fixed at 0
+    dropout = 0
+    init_std = math.sqrt((4 * (1.0 - dropout)) / (k * e))
+    nn.init.normal_(conv.weight, mean=0, std=init_std)
+    nn.init.constant_(conv.bias, 0)
+
+    # weight norm is applied along dim 2 of the kernel after initialization
+    normed = nn.utils.weight_norm(conv, name="weight", dim=2)
+    return nn.Sequential(normed, SamePad(k), nn.GELU())
+
+
+class TransformerEncoder(nn.Module):
+    """Positional convolution followed by a stack of encoder layers (with layerdrop)."""
+
+    def build_encoder_layer(self, args: Wav2Vec2Config):
+        """Build one layer for ``args.layer_type``, then apply the fsdp/checkpoint wraps."""
+        if args.layer_type == "transformer":
+            layer = TransformerSentenceEncoderLayer(
+                embedding_dim=self.embedding_dim,
+                ffn_embedding_dim=args.encoder_ffn_embed_dim,
+                num_attention_heads=args.encoder_attention_heads,
+                dropout=self.dropout,
+                attention_dropout=args.attention_dropout,
+                activation_dropout=args.activation_dropout,
+                activation_fn=args.activation_fn,
+                layer_norm_first=args.layer_norm_first,
+            )
+        elif args.layer_type == "conformer":
+            layer = ConformerWav2Vec2EncoderLayer(
+                embed_dim=self.embedding_dim,
+                ffn_embed_dim=args.encoder_ffn_embed_dim,
+                attention_heads=args.encoder_attention_heads,
+                dropout=args.dropout,
+                depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
+                activation_fn="swish",
+                attn_type=args.attn_type,
+                use_fp16=args.fp16,
+                pos_enc_type="abs",
+            )
+        else:
+            raise ValueError(f"unsupported layer_type: {args.layer_type}")
+        layer = fsdp_wrap(layer)
+        if args.checkpoint_activations:
+            layer = checkpoint_wrapper(layer)
+        return layer
+
+    def __init__(self, args: Wav2Vec2Config):
+        super().__init__()
+
+        self.args = args  # max_positions() reads args.max_positions
+        self.dropout = args.dropout
+        self.embedding_dim = args.encoder_embed_dim
+        self.required_seq_len_multiple = args.required_seq_len_multiple
+
+        pos_conv_depth = getattr(args, "pos_conv_depth", 1)
+        if pos_conv_depth > 1:
+            num_layers = args.pos_conv_depth
+            k = max(3, args.conv_pos // num_layers)
+
+            def make_conv_block(e, k, g, l):
+                return nn.Sequential(
+                    *[
+                        nn.Sequential(
+                            nn.Conv1d(
+                                e,
+                                e,
+                                kernel_size=k,
+                                padding=k // 2,
+                                groups=g,
+                            ),
+                            SamePad(k),
+                            TransposeLast(),
+                            LayerNorm(e, elementwise_affine=False),
+                            TransposeLast(),
+                            nn.GELU(),
+                        )
+                        for _ in range(l)
+                    ]
+                )
+
+            self.pos_conv = make_conv_block(
+                self.embedding_dim, k, args.conv_pos_groups, num_layers
+            )
+
+        else:
+            self.pos_conv = make_conv_pos(
+                self.embedding_dim,
+                args.conv_pos,
+                args.conv_pos_groups,
+            )
+
+        self.layers = nn.ModuleList(
+            [self.build_encoder_layer(args) for _ in range(args.encoder_layers)]
+        )
+        self.layer_norm_first = args.layer_norm_first
+        self.layer_norm = LayerNorm(self.embedding_dim)
+        self.layerdrop = args.encoder_layerdrop
+
+        self.apply(init_bert_params)
+
+    def forward(self, x, padding_mask=None, layer=None):
+        """Run ``extract_features``; pre-norm with no target layer adds the final norm."""
+        x, layer_results = self.extract_features(x, padding_mask, layer)
+        if self.layer_norm_first and layer is None:
+            x = self.layer_norm(x)
+        return x, layer_results
+
+    def extract_features(self, x, padding_mask=None, tgt_layer=None, min_layer=0):
+        """Apply the positional conv and encoder layers; return ``(x, layer_results)``.
+
+        Stops after layer ``tgt_layer`` if given; results are kept from ``min_layer`` on.
+        """
+        if padding_mask is not None:
+            x = index_put(x, padding_mask, 0)
+
+        x_conv = self.pos_conv(x.transpose(1, 2))
+        x_conv = x_conv.transpose(1, 2)
+        x = x + x_conv
+
+        if not self.layer_norm_first:
+            x = self.layer_norm(x)
+
+        # pad to the sequence length dimension
+        x, pad_length = pad_to_multiple(
+            x, self.required_seq_len_multiple, dim=-2, value=0
+        )
+        if pad_length > 0 and padding_mask is None:
+            padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
+            padding_mask[:, -pad_length:] = True
+        else:
+            padding_mask, _ = pad_to_multiple(
+                padding_mask, self.required_seq_len_multiple, dim=-1, value=True
+            )
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        layer_results = []
+        r = None
+        for i, layer in enumerate(self.layers):
+            dropout_probability = np.random.random() if self.layerdrop > 0 else 1
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, (z, lr) = layer(
+                    x, self_attn_padding_mask=padding_mask, need_weights=False
+                )
+                if i >= min_layer:
+                    layer_results.append((x, z, lr))
+            if i == tgt_layer:
+                r = x
+                break
+
+        if r is not None:
+            x = r
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        # undo padding
+        if pad_length > 0:
+            x = x[:, :-pad_length]
+
+            def undo_pad(a, b, c):
+                return (
+                    a[:-pad_length],
+                    b[:-pad_length] if b is not None else b,
+                    c[:-pad_length],
+                )
+            layer_results = [undo_pad(*u) for u in layer_results]
+
+        return x, layer_results
+
+    def max_positions(self):
+        """Maximum output length supported by the encoder."""
+        return self.args.max_positions
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        return state_dict
+
+
+class ConformerEncoder(TransformerEncoder):
+    def build_encoder_layer(self, args):
+        layer = ConformerWav2Vec2EncoderLayer(
+            embed_dim=self.embedding_dim,
+            ffn_embed_dim=args.encoder_ffn_embed_dim,
+            attention_heads=args.encoder_attention_heads,
+            dropout=args.dropout,
+            depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
+            activation_fn="swish",
+            attn_type=args.attn_type,
+            pos_enc_type=args.pos_enc_type,
+            use_fp16=args.fp16,  # only used for rope
+        )
+        layer = fsdp_wrap(layer)
+        if args.checkpoint_activations:
+            layer = checkpoint_wrapper(layer)
+        return layer
+
+    def __init__(self, args):
+        super().__init__(args)  # parent already builds pos_conv and self.layers
+        self.args = args
+        self.dropout = args.dropout
+        self.embedding_dim = args.encoder_embed_dim
+        self.pos_enc_type = args.pos_enc_type
+        max_source_positions = self.max_positions()  # inherited: self.args.max_positions
+
+        if self.pos_enc_type == "rel_pos":
+            self.embed_positions = RelPositionalEncoding(
+                max_source_positions, self.embedding_dim
+            )
+        elif self.pos_enc_type == "rope":
+            self.embed_positions = None
+        else:
+            raise Exception("Unsupported positional encoding type")
+
+        # NOTE(review): layers, norm and init below repeat what super().__init__ did
+        self.layers = nn.ModuleList(
+            [self.build_encoder_layer(args) for _ in range(args.encoder_layers)]
+        )
+        self.layer_norm_first = args.layer_norm_first
+        self.layer_norm = LayerNorm(self.embedding_dim)
+        self.layerdrop = args.encoder_layerdrop
+
+        self.apply(init_bert_params)
+
+    def extract_features(self, x, padding_mask=None, tgt_layer=None):
+        if padding_mask is not None:
+            x = index_put(x, padding_mask, 0)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # NOTE(review): x was just transposed (T x B x C per the comment above), not B x T x C
+        position_emb = None
+        if self.pos_enc_type == "rel_pos":
+            position_emb = self.embed_positions(x)
+
+        if not self.layer_norm_first:
+            x = self.layer_norm(x)
+
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        layer_results = []
+        r = None
+        for i, layer in enumerate(self.layers):
+            dropout_probability = np.random.random()
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, z = layer(
+                    x,
+                    self_attn_padding_mask=padding_mask,
+                    need_weights=False,
+                    position_emb=position_emb,
+                )
+                if tgt_layer is not None:  # results are collected only for a target layer
+                    layer_results.append((x, z))
+            if i == tgt_layer:
+                r = x
+                break
+
+        if r is not None:
+            x = r
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, layer_results
+
+
+class TransformerSentenceEncoderLayer(nn.Module):
+    """
+    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+    models.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = "relu",
+        layer_norm_first: bool = False,
+    ) -> None:
+        super().__init__()
+        # Initialize parameters
+        self.embedding_dim = embedding_dim
+        self.dropout = dropout
+        self.activation_dropout = activation_dropout
+
+        # Initialize blocks
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.self_attn = MultiheadAttention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            self_attention=True,
+        )
+
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(self.activation_dropout)
+        self.dropout3 = nn.Dropout(dropout)
+
+        self.layer_norm_first = layer_norm_first
+
+        # layer norm associated with the self attention layer
+        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
+        self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
+        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
+
+        # layer norm associated with the position wise feed-forward NN
+        self.final_layer_norm = LayerNorm(self.embedding_dim)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: torch.Tensor = None,
+        self_attn_padding_mask: torch.Tensor = None,
+        need_weights: bool = False,  # not used: attention always gets need_weights=False
+        att_args=None,  # not used
+    ):
+        """
+        LayerNorm is applied either before or after the self-attention/ffn
+        modules similar to the original Transformer implementation.
+        Returns ``(x, (attn, layer_result))``; ``layer_result`` is the ``fc2`` output.
+        """
+        residual = x
+
+        if self.layer_norm_first:
+            x = self.self_attn_layer_norm(x)
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=self_attn_padding_mask,
+                attn_mask=self_attn_mask,
+                need_weights=False,
+            )
+            x = self.dropout1(x)
+            x = residual + x
+
+            residual = x
+            x = self.final_layer_norm(x)
+            x = self.activation_fn(self.fc1(x))
+            x = self.dropout2(x)
+            x = self.fc2(x)
+
+            layer_result = x
+
+            x = self.dropout3(x)
+            x = residual + x
+        else:
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=self_attn_padding_mask,
+                attn_mask=self_attn_mask,
+                need_weights=False,
+            )
+            x = self.dropout1(x)
+            x = residual + x
+
+            x = self.self_attn_layer_norm(x)
+
+            residual = x
+            x = self.activation_fn(self.fc1(x))
+            x = self.dropout2(x)
+            x = self.fc2(x)
+
+            layer_result = x
+
+            x = self.dropout3(x)
+            x = residual + x
+            x = self.final_layer_norm(x)
+
+        return x, (attn, layer_result)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
new file mode 100644
index 0000000000..bf261624e6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
@@ -0,0 +1,837 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import copy
+import logging
+import math
+import re
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from omegaconf import II, MISSING, open_dict
+
+from fairseq import checkpoint_utils, tasks, utils
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.models import (
+    BaseFairseqModel,
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+)
+from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES
+from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer
+from fairseq.tasks import FairseqTask
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Wav2Vec2AsrConfig(FairseqDataclass):  # base config of the CTC and seq2seq configs
+    w2v_path: str = field(
+        default=MISSING, metadata={"help": "path to wav2vec 2.0 model"}
+    )
+    no_pretrained_weights: bool = field(
+        default=False, metadata={"help": "if true, does not load pretrained weights"}
+    )
+    dropout_input: float = field(
+        default=0.0,
+        metadata={"help": "dropout to apply to the input (after feat extr)"},
+    )
+
+    final_dropout: float = field(
+        default=0.0,
+        metadata={"help": "dropout after transformer and before final projection"},
+    )
+    dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"}
+    )
+    attention_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability for attention weights inside wav2vec 2.0 model"
+        },
+    )
+    activation_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability after activation in FFN inside wav2vec 2.0 model"
+        },
+    )
+
+    # masking
+    apply_mask: bool = field(
+        default=False, metadata={"help": "apply masking during fine-tuning"}
+    )
+    mask_length: int = field(
+        default=10, metadata={"help": "repeat the mask indices multiple times"}
+    )
+    mask_prob: float = field(
+        default=0.5,
+        metadata={
+            "help": "probability of replacing a token with mask (normalized by length)"
+        },
+    )
+    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose masks"}
+    )
+    mask_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_overlap: bool = field(  # NOTE(review): help text reads inverted vs. the name
+        default=False, metadata={"help": "whether to allow masks to overlap"}
+    )
+    mask_min_space: Optional[int] = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+    require_same_masks: bool = field(
+        default=True,
+        metadata={
+            "help": "whether to number of masked timesteps must be the same across all "
+            "examples in a batch"
+        },
+    )
+    mask_dropout: float = field(  # forwarded as overrides "pct_holes" and "mask_dropout"
+        default=0.0,
+        metadata={"help": "percent of masks to unmask for each sample"},
+    )
+
+    # channel masking
+    mask_channel_length: int = field(
+        default=10, metadata={"help": "length of the mask for features (channels)"}
+    )
+    mask_channel_prob: float = field(
+        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
+    )
+    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static",
+        metadata={"help": "how to choose mask length for channel masking"},
+    )
+    mask_channel_other: float = field(  # NOTE(review): help has typo "compute_mask_indicesh"
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indicesh"
+        },
+    )
+    no_mask_channel_overlap: bool = field(
+        default=False, metadata={"help": "whether to allow channel masks to overlap"}
+    )
+    freeze_finetune_updates: int = field(
+        default=0, metadata={"help": "dont finetune wav2vec for this many updates"}
+    )
+    feature_grad_mult: float = field(
+        default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"}
+    )
+    layerdrop: float = field(
+        default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"}
+    )
+    drop_path: float = 0  # forwarded as the "drop_path" override by Wav2VecEncoder
+    mask_channel_min_space: Optional[int] = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+    mask_channel_before: bool = False  # forwarded as the "mask_channel_before" override
+    normalize: bool = II("task.normalize")  # interpolated from task.normalize
+    update_alibi: bool = True  # forwarded as the "learned_alibi_scale" override
+    data: str = II("task.data")  # interpolated from task.data
+    # this holds the loaded wav2vec args
+    w2v_args: Any = None
+    offload_activations: bool = field(
+        default=False, metadata={"help": "offload_activations"}
+    )
+    min_params_to_wrap: int = field(
+        default=int(1e8),
+        metadata={
+            "help": "minimum number of params for a layer to be wrapped with FSDP() when "
+            "training with --ddp-backend=fully_sharded. Smaller values will "
+            "improve memory efficiency, but may make torch.distributed "
+            "communication less efficient due to smaller input sizes. This option "
+            "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
+            "--offload-activations are passed."
+        },
+    )
+
+    checkpoint_activations: bool = field(
+        default=False,
+        metadata={"help": "recompute activations and save memory for extra compute"},
+    )
+    ddp_backend: str = II("distributed_training.ddp_backend")  # interpolated value
+
+    zero_mask: bool = False  # forwarded as overrides "zero_mask" and "encoder_zero_mask"
+    load_ema: bool = False  # NOTE(review): consumer not visible in this part of the file
+
+    layer_decay: float = 1  # NOTE(review): consumer not visible in this part of the file
+
+
+@dataclass
+class Wav2Vec2CtcConfig(Wav2Vec2AsrConfig):
+    blank_weight: float = 0  # applied to logits[..., 0] in Wav2VecCtc.get_logits; 0 skips it
+    blank_mode: str = "add"  # "add": add blank_weight; "set": overwrite with blank_weight
+
+
+@register_model("wav2vec_ctc", dataclass=Wav2Vec2CtcConfig)
+class Wav2VecCtc(BaseFairseqModel):
+    """CTC model: wraps a ``Wav2VecEncoder`` and post-processes its logits."""
+
+    def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel):
+        super().__init__()
+        self.cfg = cfg
+        self.w2v_encoder = w2v_encoder
+        self.blank_weight = cfg.blank_weight
+        self.blank_mode = cfg.blank_mode
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+    @classmethod
+    def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask):
+        """Build a new model instance."""
+        w2v_encoder = Wav2VecEncoder(cfg, len(task.target_dictionary))
+        return cls(cfg, w2v_encoder)
+
+    def get_logits(self, net_output, normalize=False):
+        """Return the encoder logits after blank re-weighting and padding masking."""
+        logits = net_output["encoder_out"]
+        if self.blank_weight != 0:
+            if self.blank_mode == "add":
+                # clone first: an in-place += on the shared encoder output would be
+                # applied again on every later get_logits call for the same net_output
+                logits = logits.clone()
+                logits[..., 0] += self.blank_weight
+            elif self.blank_mode == "set":
+                logits[..., 0] = self.blank_weight
+            else:
+                raise Exception(f"invalid blank mode {self.blank_mode}")
+
+        if net_output["padding_mask"] is not None and net_output["padding_mask"].any():
+            number_of_classes = logits.size(-1)
+            # padded frames get 0 for index 0 and -inf for every other class
+            masking_tensor = logits.new_full((number_of_classes,), float("-inf"))
+            masking_tensor[0] = 0
+            if logits.size(0) > net_output["padding_mask"].size(1):
+                net_output["padding_mask"] = F.pad(
+                    net_output["padding_mask"], (1, 0), value=False
+                )
+            logits[net_output["padding_mask"].T] = masking_tensor
+
+        if normalize:
+            logits = utils.log_softmax(logits.float(), dim=-1)
+
+        return logits
+
+    def get_normalized_probs(self, net_output, log_probs):
+        """Get normalized probabilities (or log probs) from a net's output."""
+        logits = self.get_logits(net_output)
+        if log_probs:
+            return utils.log_softmax(logits.float(), dim=-1)
+        return utils.softmax(logits.float(), dim=-1)
+
+    def forward(self, **kwargs):
+        """Forward all keyword arguments to the wrapped encoder and return its output."""
+        return self.w2v_encoder(**kwargs)
+
+
+@dataclass
+class Wav2Vec2Seq2SeqConfig(Wav2Vec2AsrConfig):  # adds the decoder options
+    decoder_embed_dim: int = field(
+        default=768, metadata={"help": "decoder embedding dimension"}
+    )
+    decoder_ffn_embed_dim: int = field(
+        default=3072, metadata={"help": "decoder embedding dimension for FFN"}
+    )
+    decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"})
+    decoder_layerdrop: float = field(
+        default=0.0, metadata={"help": "decoder layerdrop chance"}
+    )
+    decoder_attention_heads: int = field(
+        default=4, metadata={"help": "num decoder attention heads"}
+    )
+    decoder_learned_pos: bool = field(
+        default=False,
+        metadata={"help": "use learned positional embeddings in the decoder"},
+    )
+    decoder_normalize_before: bool = field(
+        default=False, metadata={"help": "apply layernorm before each decoder block"}
+    )
+    no_token_positional_embeddings: bool = field(
+        default=False,
+        metadata={
+            "help": "if set, disables positional embeddings (outside self attention)"
+        },
+    )
+    decoder_dropout: float = field(
+        default=0.0, metadata={"help": "dropout probability in the decoder"}
+    )
+    decoder_attention_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability for attention weights inside the decoder"
+        },
+    )
+    decoder_activation_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "dropout probability after activation in FFN inside the decoder"
+        },
+    )
+    max_target_positions: int = field(
+        default=2048, metadata={"help": "max target positions"}
+    )
+    share_decoder_input_output_embed: bool = field(
+        default=False, metadata={"help": "share decoder input and output embeddings"}
+    )
+    autoregressive: bool = II("task.autoregressive")  # build_model asserts this is true
+
+
+@register_model("wav2vec_seq2seq", dataclass=Wav2Vec2Seq2SeqConfig)
+class Wav2Vec2Seq2SeqModel(FairseqEncoderDecoderModel):
+    """Pairs a ``Wav2VecEncoder`` with a ``TransformerDecoder`` ("wav2vec_seq2seq")."""
+
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+
+    @classmethod
+    def build_model(cls, cfg: Wav2Vec2Seq2SeqConfig, task: FairseqTask):
+        """Build a new model instance."""
+        assert (
+            cfg.autoregressive
+        ), "Please set task.autoregressive=true for seq2seq asr models"
+
+        # both dictionaries are read from the task; only the target one is used below
+        source_vocab, target_vocab = task.source_dictionary, task.target_dictionary
+
+        # the target-token embedding is created before the encoder and the decoder
+        embed_dim = cfg.decoder_embed_dim
+        vocab_size, pad_idx = len(target_vocab), target_vocab.pad()
+        token_embedding = Embedding(vocab_size, embed_dim, pad_idx)
+
+        audio_encoder = cls.build_encoder(cfg)
+        text_decoder = cls.build_decoder(cfg, target_vocab, token_embedding)
+        return Wav2Vec2Seq2SeqModel(audio_encoder, text_decoder)
+
+    @classmethod
+    def build_encoder(cls, cfg: Wav2Vec2AsrConfig):
+        """Create the ``Wav2VecEncoder`` for ``cfg``."""
+        return Wav2VecEncoder(cfg)
+
+    @classmethod
+    def build_decoder(cls, cfg: Wav2Vec2Seq2SeqConfig, tgt_dict, embed_tokens):
+        """Create the ``TransformerDecoder`` for ``cfg``, ``tgt_dict`` and ``embed_tokens``."""
+        return TransformerDecoder(cfg, tgt_dict, embed_tokens)
+
+    def forward(self, **kwargs):
+        """Encode with ``**kwargs``, then decode with the same kwargs plus ``encoder_out``."""
+        enc = self.encoder(**kwargs)
+        return self.decoder(encoder_out=enc, **kwargs)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        super().upgrade_state_dict_named(state_dict, name)
+        return state_dict
+
+
+class Wav2VecEncoder(FairseqEncoder):
+    def __init__(self, cfg: Wav2Vec2AsrConfig, output_size=None):
+        self.apply_mask = cfg.apply_mask
+
+        arg_overrides = {  # handed to the checkpoint loader below when cfg.w2v_args is unset
+            "dropout": cfg.dropout,
+            "activation_dropout": cfg.activation_dropout,
+            "dropout_input": cfg.dropout_input,
+            "attention_dropout": cfg.attention_dropout,
+            "mask_length": cfg.mask_length,
+            "mask_prob": cfg.mask_prob,
+            "require_same_masks": getattr(cfg, "require_same_masks", True),
+            "pct_holes": getattr(cfg, "mask_dropout", 0),
+            "mask_selection": cfg.mask_selection,
+            "mask_other": cfg.mask_other,
+            "no_mask_overlap": cfg.no_mask_overlap,
+            "mask_channel_length": cfg.mask_channel_length,
+            "mask_channel_prob": cfg.mask_channel_prob,
+            "mask_channel_before": cfg.mask_channel_before,
+            "mask_channel_selection": cfg.mask_channel_selection,
+            "mask_channel_other": cfg.mask_channel_other,
+            "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
+            "encoder_layerdrop": cfg.layerdrop,
+            "feature_grad_mult": cfg.feature_grad_mult,
+            "checkpoint_activations": cfg.checkpoint_activations,
+            "offload_activations": cfg.offload_activations,
+            "min_params_to_wrap": cfg.min_params_to_wrap,
+            # d2v multi args
+            "encoder_dropout": cfg.dropout,
+            "drop_path": getattr(cfg, "drop_path", 0),
+            "mask_dropout": getattr(cfg, "mask_dropout", 0),
+            "zero_mask": getattr(cfg, "zero_mask", False),
+            "local_grad_mult": cfg.feature_grad_mult,
+            "layerdrop": cfg.layerdrop,
+            "prenet_layerdrop": cfg.layerdrop,
+            "prenet_dropout": cfg.dropout,
+            "post_mlp_drop": cfg.dropout,
+            "encoder_zero_mask": getattr(cfg, "zero_mask", False),
+            "inverse_mask": False,
+            "learned_alibi_scale": getattr(cfg, "update_alibi", True),
+        }
+
+        if cfg.w2v_args is None:
+            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides)
+            w2v_args = state.get("cfg", None)
+            if w2v_args is None:
+                w2v_args = convert_namespace_to_omegaconf(state["args"])
+            w2v_args.criterion = None  # the criterion and lr_scheduler sections of the
+            w2v_args.lr_scheduler = None  # loaded config are cleared before it is stored on cfg
+
+            cfg.w2v_args = w2v_args
+
+            logger.info(w2v_args)
+
+        else:
+            state = None  # no checkpoint was read here, so no weights are restored below
+            w2v_args = cfg.w2v_args
+            if isinstance(w2v_args, Namespace):
+                cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args)
+
+        self.is_d2v_multi = "data2vec_multi" in (w2v_args.model.get("_name") or "")
+
+        if not self.is_d2v_multi:
+            model_normalized = w2v_args.task.get(
+                "normalize", w2v_args.model.get("normalize", False)
+            )
+            assert cfg.normalize == model_normalized, (
+                "Fine-tuning works best when data normalization is the same. "
+                "Please check that --normalize is set or unset for both pre-training and here"
+            )
+
+            if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations:
+                with open_dict(w2v_args):
+                    w2v_args.model.checkpoint_activations = cfg.checkpoint_activations
+
+            w2v_args.task.data = cfg.data
+            task = tasks.setup_task(w2v_args.task, from_checkpoint=True)
+            model = task.build_model(w2v_args.model, from_checkpoint=True)
+
+            model.remove_pretraining_modules()
+            d = w2v_args.model.encoder_embed_dim  # feature width used for the optional projection
+        else:
+            assert cfg.normalize
+
+            if hasattr(w2v_args.task, "audio"):
+                w2v_args.task.audio.data = cfg.data
+            else:
+                w2v_args.task.data = cfg.data
+            task = tasks.setup_task(w2v_args.task, from_checkpoint=True)
+
+            model = task.build_model(w2v_args.model, from_checkpoint=True)
+
+            model.remove_pretraining_modules(modality="audio")
+            d = w2v_args.model.embed_dim  # feature width used for the optional projection
+
+        if state is not None and not cfg.no_pretrained_weights:
+            if cfg.load_ema:
+                assert "_ema" in state["model"]
+                for k in state["model"]["_ema"]:  # overwrite "encoder.<k>" entries with EMA values
+                    mk = "encoder." + k
+                    assert mk in state["model"], mk
+                    state["model"][mk] = state["model"]["_ema"][k]
+            self.load_model_weights(state, model, cfg)
+
+        super().__init__(task.source_dictionary)
+
+        self.w2v_model = model
+
+        self.final_dropout = nn.Dropout(cfg.final_dropout)
+        self.freeze_finetune_updates = cfg.freeze_finetune_updates
+        self.num_updates = 0
+
+        targ_d = None
+        self.proj = None  # stays None unless a target width different from d is requested
+
+        if output_size is not None:
+            targ_d = output_size
+        elif getattr(cfg, "decoder_embed_dim", d) != d:
+            targ_d = cfg.decoder_embed_dim
+
+        if targ_d is not None:
+            self.proj = Linear(d, targ_d)
+
+        layer_decay = getattr(cfg, "layer_decay", 1)
+        if layer_decay < 1:  # attach per-block "lr_scale" overrides to the parameters
+            mod_encs = list(model.modality_encoders.values())
+            assert len(mod_encs) == 1, len(mod_encs)
+            blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks)
+            num_layers = len(blocks) + 1
+            layer_scales = [
+                layer_decay ** (num_layers - i) for i in range(num_layers + 1)
+            ]
+
+            for i, b in enumerate(blocks):
+                lid = i + 1
+                if layer_scales[lid] == 1.0:
+                    continue
+
+                for n, p in b.named_parameters():
+                    optim_override = getattr(p, "optim_overrides", {})
+                    if "optimizer" not in optim_override:
+                        optim_override["optimizer"] = {}
+
+                    optim_override["optimizer"]["lr_scale"] = layer_scales[lid]
+                    p.optim_overrides = optim_override
+
+    def load_model_weights(self, state, model, cfg):
+        if cfg.ddp_backend == "fully_sharded":
+            from fairseq.distributed import FullyShardedDataParallel
+
+            for name, module in model.named_modules():
+                if "encoder.layers" in name and len(name.split(".")) == 3:
+                    # Only for layers, we do a special handling and load the weights one by one
+                    # We dont load all weights together as that wont be memory efficient and may
+                    # cause oom
+                    new_dict = {
+                        k.replace(name + ".", ""): v
+                        for (k, v) in state["model"].items()
+                        if name + "." in k
+                    }
+                    assert isinstance(module, FullyShardedDataParallel)
+                    with module.summon_full_params():
+                        module.load_state_dict(new_dict, strict=True)
+                    module._reset_lazy_init()
+
+            # Once layers are loaded, filter them out and load everything else.
+            r = re.compile(r"encoder.layers.\d.")  # raw string: same pattern, no invalid "\d" escape
+            filtered_list = list(filter(r.match, state["model"].keys()))
+
+            new_big_dict = {
+                k: v for (k, v) in state["model"].items() if k not in filtered_list
+            }
+
+            model.load_state_dict(new_big_dict, strict=False)
+        else:
+            to_delete = {"_ema", "target_proj", "decoder"}  # top-level keys dropped before loading
+            for k in to_delete:
+                if k in state["model"]:
+                    del state["model"][k]
+
+            if hasattr(model, "modality_encoders"):
+                if "modality_encoders.AUDIO.encoder_mask" not in state["model"]:
+                    model.modality_encoders["AUDIO"].encoder_mask = None
+                elif not cfg.zero_mask:
+                    model.modality_encoders["AUDIO"].encoder_mask = None
+                    del state["model"]["modality_encoders.AUDIO.encoder_mask"]
+
+                for k in list(state["model"].keys()):  # copy keys: entries are deleted while looping
+                    if k.startswith("modality_encoders.") and not k.startswith(
+                        "modality_encoders.AUDIO"
+                    ):
+                        del state["model"][k]
+
+            logger.info(model)  # was print(); routed through the logger already used in __init__
+            model.load_state_dict(state["model"], strict=True)
+
+    def set_num_updates(self, num_updates):
+        """Store the number of parameter updates (forward() compares it to freeze_finetune_updates)."""
+        super().set_num_updates(num_updates)
+        self.num_updates = num_updates
+
+    def forward(self, source, padding_mask, **kwargs):
+        """Extract features with the wrapped model; return them time-major with the padding mask."""
+        w2v_args = {
+            "source": source,
+            "padding_mask": padding_mask,
+            "mask": self.apply_mask and self.training,  # masking is only requested in training mode
+        }
+
+        if self.is_d2v_multi:
+            w2v_args["mode"] = "AUDIO"
+
+        ft = self.freeze_finetune_updates <= self.num_updates  # True once the freeze period is over
+
+        with torch.no_grad() if not ft else contextlib.ExitStack():  # no gradients while frozen
+            res = self.w2v_model.extract_features(**w2v_args)
+
+            x = res["x"]
+            padding_mask = res["padding_mask"]
+
+            # B x T x C -> T x B x C
+            x = x.transpose(0, 1)
+
+        x = self.final_dropout(x)
+
+        if self.proj is not None:  # explicit None check; proj is only built when widths differ
+            x = self.proj(x)
+
+        return {
+            "encoder_out": x,  # T x B x C
+            "padding_mask": padding_mask,  # B x T,
+            "layer_results": res["layer_results"],
+        }
+
+    def forward_torchscript(self, net_input):
+        if torch.jit.is_scripting():  # under scripting, call forward with explicit arguments
+            return self.forward(net_input["source"], net_input["padding_mask"])
+        else:
+            return self.forward_non_torchscript(net_input)  # helper defined outside this class
+
+    def reorder_encoder_out(self, encoder_out, new_order):
+        if encoder_out["encoder_out"] is not None:  # T x B x C in forward(): batch is dim 1
+            encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
+                1, new_order
+            )
+        if encoder_out["padding_mask"] is not None:  # B x T in forward(): batch is dim 0
+            encoder_out["padding_mask"] = encoder_out["padding_mask"].index_select(
+                0, new_order
+            )
+        return encoder_out  # the same dict is updated in place and returned
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder (always None here)."""
+        return None
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict  # no key upgrades are applied; the dict is returned as-is
+
+
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Transformer decoder consisting of *cfg.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+
+    Args:
+        cfg (Wav2Vec2Seq2SeqConfig): model configuration
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_tokens (torch.nn.Embedding): token embedding
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self,
+        cfg: Wav2Vec2Seq2SeqConfig,
+        dictionary,
+        embed_tokens,
+        no_encoder_attn=False,
+    ):
+        super().__init__(dictionary)
+
+        self.dropout = cfg.decoder_dropout
+        self.share_input_output_embed = cfg.share_decoder_input_output_embed
+
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = cfg.decoder_embed_dim
+        self.output_embed_dim = cfg.decoder_embed_dim
+
+        self.layerdrop = cfg.decoder_layerdrop
+
+        self.padding_idx = embed_tokens.padding_idx
+        self.max_target_positions = cfg.max_target_positions
+
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
+
+        self.project_in_dim = (  # only needed when the embedding width differs from embed_dim
+            Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+
+        self.embed_positions = (  # None when positional embeddings are disabled in cfg
+            PositionalEmbedding(
+                cfg.max_target_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=cfg.decoder_learned_pos,
+            )
+            if not cfg.no_token_positional_embeddings
+            else None
+        )
+
+        # TODO: update this when transformer gets converted to dataclass configs
+        transformer_cfg = copy.deepcopy(cfg)  # work on a copy so cfg itself is not modified
+        with open_dict(transformer_cfg):
+            transformer_cfg.dropout = transformer_cfg.decoder_dropout
+            transformer_cfg.attention_dropout = (
+                transformer_cfg.decoder_attention_dropout
+            )
+            transformer_cfg.activation_dropout = (
+                transformer_cfg.decoder_activation_dropout
+            )
+
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                TransformerDecoderLayer(transformer_cfg, no_encoder_attn)
+                for _ in range(transformer_cfg.decoder_layers)
+            ]
+        )
+
+        if not self.share_input_output_embed:  # separate output projection matrix
+            self.embed_out = nn.Parameter(
+                torch.Tensor(len(dictionary), self.output_embed_dim)
+            )
+            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5)
+
+        if transformer_cfg.decoder_normalize_before:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (Tensor, optional): output from the encoder, used for
+                encoder-side attention
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+
+        if isinstance(prev_output_tokens, list):  # list of 1-D tensors: stack into one batch
+            max_len = max((len(x) for x in prev_output_tokens))
+            tmp = torch.zeros(  # NOTE(review): pads with 0 rather than self.padding_idx - confirm
+                [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device
+            )
+            for (i, p) in enumerate(prev_output_tokens):
+                tmp[i, : len(p)] = p
+            prev_output_tokens = tmp
+
+        prev_output_tokens = prev_output_tokens.long()
+        x, extra = self.extract_features(
+            prev_output_tokens, encoder_out, incremental_state
+        )
+        x = self.output_layer(x)
+        return x, extra
+
+    def extract_features(
+        self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused
+    ):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+
+        # embed positions
+        positions = (
+            self.embed_positions(
+                prev_output_tokens, incremental_state=incremental_state
+            )
+            if self.embed_positions is not None
+            else None
+        )
+
+        if incremental_state is not None:  # incremental decoding: keep only the newest step
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            if positions is not None:
+                positions = positions[:, -1:]
+
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+
+        if positions is not None:
+            x += positions
+        x = F.dropout(x, p=self.dropout, training=self.training)
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        attn = None
+
+        inner_states = [x]  # starts with the embedded input; one entry is appended per executed layer
+
+        # decoder layers
+        self_attn_padding_mask = None
+        if prev_output_tokens.eq(self.padding_idx).any():  # build the mask only if padding exists
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        for layer in self.layers:
+            dropout_probability = np.random.random()  # LayerDrop draw; layers are never skipped in eval
+            if not self.training or (dropout_probability > self.layerdrop):
+                x, attn, _ = layer(
+                    x,
+                    encoder_out["encoder_out"] if encoder_out is not None else None,
+                    encoder_out["padding_mask"] if encoder_out is not None else None,
+                    incremental_state,
+                    self_attn_mask=self.buffered_future_mask(x)
+                    if incremental_state is None
+                    else None,
+                    self_attn_padding_mask=self_attn_padding_mask,
+                )
+                inner_states.append(x)
+
+        if self.layer_norm:
+            x = self.layer_norm(x)
+
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+
+        return x, {"attn": attn, "inner_states": inner_states}  # attn comes from the last executed layer
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the vocabulary size."""
+        # project back to size of vocabulary
+        if self.share_input_output_embed:
+            return F.linear(features, self.embed_tokens.weight)  # reuse the input embedding matrix
+        else:
+            return F.linear(features, self.embed_out)  # separate matrix created in __init__
+
+    def max_positions(self):
+        """Maximum output length supported by the decoder."""
+        if self.embed_positions is None:  # no positional embedding: only the config limit applies
+            return self.max_target_positions
+        return min(self.max_target_positions, self.embed_positions.max_positions)
+
+    def buffered_future_mask(self, tensor):
+        dim = tensor.size(0)  # mask side length is taken from the first dimension of `tensor`
+        if (  # rebuild the cached mask when it is missing, on another device, or too small
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(  # keeps only entries strictly above the diagonal
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]  # a larger cached mask is sliced down to dim x dim
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        return state_dict  # no key upgrades are applied; the dict is returned as-is
+
+
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)  # std = 1 / sqrt(embedding_dim)
+    nn.init.constant_(m.weight[padding_idx], 0)  # re-zero the padding row after the normal init
+    return m
+
+
+def Linear(in_features, out_features, bias=True):
+    m = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(m.weight)  # Xavier-uniform weights
+    if bias:  # the bias exists only when requested
+        nn.init.constant_(m.bias, 0.0)  # zero bias
+    return m
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
new file mode 100644
index 0000000000..ff89759d38
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.models import BaseFairseqModel, register_model
+from fairseq.models.wav2vec.wav2vec2_asr import (
+    Wav2Vec2CtcConfig,
+    Wav2VecCtc,
+    Wav2VecEncoder,
+)
+from fairseq.tasks import FairseqTask
+
+
+@register_model("wav2vec2_laser", dataclass=Wav2Vec2CtcConfig)
+class Wav2VecLaser(Wav2VecCtc):
+    def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel):
+        super().__init__(cfg, w2v_encoder)
+        self.num_updates = 0
+        self.freeze_finetune_updates = cfg.freeze_finetune_updates
+
+    @classmethod
+    def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask):
+        """Build a new model instance."""
+        w2v_encoder = Wav2VecEncoder(cfg, 1024)  # 1024 is passed as the encoder's output_size
+        return cls(cfg, w2v_encoder)
+
+    def forward(self, **kwargs):
+        output = super().forward(**kwargs)
+        x_out = output["encoder_out"] * 0.01  # fixed scale factor; its rationale is not shown here
+        out_pad_mask = output["padding_mask"]
+        # Set padded outputs to -inf so they are not selected by max-pooling
+        if out_pad_mask is not None and out_pad_mask.any():
+            x_out = (
+                x_out.float()
+                .masked_fill_(out_pad_mask.T.unsqueeze(-1), float("-inf"))
+                .type_as(x_out)
+            )
+        return x_out.max(dim=0)[0]  # max over dim 0 (presumably time - confirm against Wav2VecCtc)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
new file mode 100644
index 0000000000..bbf7694920
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .model import *  # noqa
+from .transformer_layer_xmod import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
new file mode 100644
index 0000000000..909bb423ca
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from fairseq.models.roberta.hub_interface import RobertaHubInterface
+import torch
+import torch.nn.functional as F
+
+
+class XMODHubInterface(RobertaHubInterface):
+    def extract_features(
+        self,
+        tokens: torch.LongTensor,
+        return_all_hiddens: bool = False,
+        lang_id=None,  # forwarded unchanged to the model call below
+    ) -> torch.Tensor:
+        if tokens.dim() == 1:  # add a batch dimension for a single sequence
+            tokens = tokens.unsqueeze(0)
+        if tokens.size(-1) > self.model.max_positions():
+            raise ValueError(
+                "tokens exceeds maximum length: {} > {}".format(
+                    tokens.size(-1), self.model.max_positions()
+                )
+            )
+        features, extra = self.model(
+            tokens.to(device=self.device),
+            features_only=True,
+            return_all_hiddens=return_all_hiddens,
+            lang_id=lang_id,
+        )
+        if return_all_hiddens:  # NOTE: this branch returns a list of tensors, not a single tensor
+            # convert from T x B x C -> B x T x C
+            inner_states = extra["inner_states"]
+            return [inner_state.transpose(0, 1) for inner_state in inner_states]
+        else:
+            return features  # just the last layer's features
+
+    def predict(
+        self,
+        head: str,  # key into self.model.classification_heads
+        tokens: torch.LongTensor,
+        return_logits: bool = False,
+        lang_id=None,
+    ):
+        features = self.extract_features(tokens.to(device=self.device), lang_id=lang_id)
+        logits = self.model.classification_heads[head](features)
+        if return_logits:  # raw logits on request, log-probabilities otherwise
+            return logits
+        return F.log_softmax(logits, dim=-1)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
new file mode 100644
index 0000000000..fb6c7a8dea
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
@@ -0,0 +1,742 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ..roberta.model_xlmr import XLMRModel
+from fairseq.models.xmod.transformer_layer_xmod import XMODTransformerEncoderLayerBase
+from ..roberta.model import base_architecture, RobertaEncoder
+from fairseq.models.transformer import TransformerEncoder
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+from typing import Optional
+from fairseq.models.xmod.hub_interface import XMODHubInterface
+import torch
+from fairseq.distributed import fsdp_wrap
+from fairseq.models import (
+    register_model,
+    register_model_architecture,
+)
+
+from fairseq.modules.checkpoint_activations import checkpoint_wrapper
+
+DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8)
+
+
+@register_model("xmod")
+class XMODModel(XLMRModel):
+    @classmethod
+    def hub_models(cls):
+        return {  # model name -> download URL of the released archive
+            "xmod.base": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz",
+            "xmod.large.prenorm": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz",
+            "xmod.base.13.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz",
+            "xmod.base.30.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz",
+            "xmod.base.30.195k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz",
+            "xmod.base.60.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz",
+            "xmod.base.60.265k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz",
+            "xmod.base.75.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz",
+            "xmod.base.75.269k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz",
+        }
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        bpe="sentencepiece",
+        **kwargs,
+    ):
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            bpe=bpe,
+            load_checkpoint_heads=True,
+            **kwargs,
+        )
+        return XMODHubInterface(x["args"], x["task"], x["models"][0])  # wraps the first loaded model
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        from omegaconf import OmegaConf
+
+        if OmegaConf.is_config(args):
+            OmegaConf.set_struct(args, False)  # struct mode off while attributes are filled in below
+
+        # make sure all arguments are present
+        base_architecture(args)
+
+        if not hasattr(args, "max_positions"):  # fall back to tokens_per_sample, then the task limit
+            if not hasattr(args, "tokens_per_sample"):
+                args.tokens_per_sample = task.max_positions()
+            args.max_positions = args.tokens_per_sample
+
+        encoder = XMODEncoder(args, task.source_dictionary)
+
+        if OmegaConf.is_config(args):
+            OmegaConf.set_struct(args, True)  # struct mode back on once the encoder is built
+
+        return cls(args, encoder)
+
+    def forward(
+        self,
+        src_tokens,
+        features_only=False,
+        return_all_hiddens=False,
+        classification_head_name=None,
+        lang_id=None,  # forwarded unchanged to the encoder
+        **kwargs,
+    ):
+        if classification_head_name is not None:  # heads consume features, so skip the LM head
+            features_only = True
+        x, extra = self.encoder(
+            src_tokens, features_only, return_all_hiddens, lang_id=lang_id, **kwargs
+        )
+
+        if classification_head_name is not None:
+            x = self.classification_heads[classification_head_name](x)
+        return x, extra
+
+
+class XMODEncoder(RobertaEncoder):
+    """XMOD encoder."""
+
+    def build_encoder(self, args, dictionary, embed_tokens):
+        encoder = XMODTransformerEncoder(args, dictionary, embed_tokens)
+        encoder.apply(init_bert_params)  # apply init_bert_params to every submodule
+        return encoder
+
+    def forward(
+        self,
+        src_tokens,
+        features_only=False,
+        return_all_hiddens=False,
+        masked_tokens=None,
+        lang_id=None,  # forwarded unchanged to extract_features
+        **unused,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            features_only (bool, optional): skip LM head and just return
+                features. If True, the output will be of shape
+                `(batch, src_len, embed_dim)`.
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+
+        Returns:
+            tuple:
+                - the LM output of shape `(batch, src_len, vocab)`
+                - a dictionary of additional data, where 'inner_states'
+                  is a list of hidden states. Note that the hidden
+                  states have shape `(src_len, batch, embed_dim)`.
+        """
+        x, extra = self.extract_features(
+            src_tokens, return_all_hiddens=return_all_hiddens, lang_id=lang_id
+        )
+        if not features_only:
+            x = self.output_layer(x, masked_tokens=masked_tokens)
+        return x, extra
+
+    def extract_features(
+        self, src_tokens, return_all_hiddens=False, lang_id=None, **kwargs
+    ):
+        encoder_out = self.sentence_encoder(
+            src_tokens,
+            return_all_hiddens=return_all_hiddens,
+            lang_id=lang_id,
+            token_embeddings=kwargs.get("token_embeddings", None),  # optional precomputed embeddings
+        )
+        # T x B x C -> B x T x C
+        features = encoder_out["encoder_out"][0].transpose(0, 1)
+        inner_states = encoder_out["encoder_states"] if return_all_hiddens else None
+        return features, {"inner_states": inner_states}  # inner_states is None unless requested
+
+
+class XMODTransformerEncoder(TransformerEncoder):
+    def build_encoder_layer(self, cfg):
+        layer = XMODTransformerEncoderLayerBase(cfg)
+        checkpoint = cfg.checkpoint_activations
+        if checkpoint:
+            offload_to_cpu = cfg.offload_activations
+            layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
+        # if we are checkpointing, enforce that FSDP always wraps the
+        # checkpointed layer, regardless of layer size
+        min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
+        layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
+        return layer
+
+    def forward(
+        self,
+        src_tokens,
+        src_lengths: Optional[torch.Tensor] = None,
+        return_all_hiddens: bool = False,
+        token_embeddings: Optional[torch.Tensor] = None,
+        lang_id=None,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (torch.LongTensor): source lengths, shape `(batch)`
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            token_embeddings (torch.Tensor, optional): precomputed embeddings
+                default `None` will recompute embeddings
+            lang_id (optional): passed through to `forward_scriptable`.
+
+        Returns:
+            dict:
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch, src_len)`
+                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
+                  of shape `(batch, src_len, embed_dim)`
+                - **encoder_states** (List[Tensor]): all intermediate
+                  hidden states of shape `(src_len, batch, embed_dim)`.
+                  Only populated if *return_all_hiddens* is True.
+        """
+        return self.forward_scriptable(
+            src_tokens,
+            src_lengths,
+            return_all_hiddens,
+            token_embeddings,
+            lang_id=lang_id,
+        )
+        # TorchScript doesn't support super() method so that the scriptable Subclass
+        # can't access the base class model in Torchscript.
+        # Current workaround is to add a helper function with different name and
+        # call the helper function from scriptable Subclass.
+
+    def forward_scriptable(
+        self,
+        src_tokens,
+        src_lengths: Optional[torch.Tensor] = None,
+        return_all_hiddens: bool = False,
+        token_embeddings: Optional[torch.Tensor] = None,
+        lang_id=None,
+    ):
+        """
+        Args:
+            src_tokens (LongTensor): source tokens, shape `(batch, src_len)`
+            src_lengths (torch.LongTensor): not read by this method; the
+                returned `src_lengths` is recomputed from `src_tokens`
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+            token_embeddings (torch.Tensor, optional): precomputed embeddings
+                default `None` will recompute embeddings
+            lang_id (optional): passed unchanged to every encoder layer.
+
+        Returns:
+            dict:
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(src_len, batch, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch, src_len)`
+                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
+                  of shape `(batch, src_len, embed_dim)`
+                - **encoder_states** (List[Tensor]): all intermediate
+                  hidden states of shape `(src_len, batch, embed_dim)`.
+                  Only populated if *return_all_hiddens* is True.
+        """
+        # compute padding mask
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+        has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any()
+
+        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
+
+        # account for padding while computing the representation
+        if has_pads:
+            x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        encoder_states = []
+
+        if return_all_hiddens:
+            encoder_states.append(x)
+
+        # encoder layers
+        for layer in self.layers:
+            x = layer(
+                x,
+                encoder_padding_mask=encoder_padding_mask if has_pads else None,
+                lang_id=lang_id,
+            )
+            if return_all_hiddens:
+                assert encoder_states is not None
+                encoder_states.append(x)
+
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+
+        # The Pytorch Mobile lite interpreter does not support returning NamedTuple in
+        # `forward` so we use a dictionary instead.
+        # TorchScript does not support mixed values so the values are all lists.
+        # The empty list is equivalent to None.
+        src_lengths = (
+            src_tokens.ne(self.padding_idx)
+            .sum(dim=1, dtype=torch.int32)
+            .reshape(-1, 1)
+            .contiguous()
+        )
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask],  # B x T
+            "encoder_embedding": [encoder_embedding],  # B x T x C
+            "encoder_states": encoder_states,  # List[T x B x C]
+            "src_tokens": [],  # always empty here
+            "src_lengths": [src_lengths],  # B x 1, int32 count of non-padding tokens
+        }
+
+
+@register_model_architecture("xmod", "xmod_base_13")
+def xmod_base_13_architecture(args):  # fills in defaults for the "xmod_base_13" arch
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", True)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "ar_AR",
+            "en_XX",
+            "fi_FI",
+            "fr_XX",
+            "hi_IN",
+            "id_ID",
+            "ka_GE",
+            "ko_KR",
+            "ru_RU",
+            "sw_KE",
+            "ta_IN",
+            "th_TH",
+            "vi_VN",
+        ],
+    )
+    base_architecture(args)
+
+
+@register_model_architecture("xmod", "xmod_base_30")
+def xmod_base_30_architecture(args):  # fills in defaults for the "xmod_base_30" arch
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", True)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "ar_AR",
+            "cs_CZ",
+            "en_XX",
+            "eu_ES",
+            "fi_FI",
+            "fr_XX",
+            "hi_IN",
+            "hr_HR",
+            "hu_HU",
+            "hy_AM",
+            "id_ID",
+            "it_IT",
+            "ka_GE",
+            "ko_KR",
+            "lt_LT",
+            "ml_IN",
+            "mn_MN",
+            "ms_MY",
+            "pl_PL",
+            "ro_RO",
+            "ru_RU",
+            "si_LK",
+            "sk_SK",
+            "sq_AL",
+            "sv_SE",
+            "sw_KE",
+            "ta_IN",
+            "th_TH",
+            "tl_XX",
+            "vi_VN",
+        ],
+    )
+    base_architecture(args)
+
+
+@register_model_architecture("xmod", "xmod_base_60")
+def xmod_base_60_architecture(args):  # fills in defaults for the "xmod_base_60" arch
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", True)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "af_ZA",
+            "am_ET",
+            "ar_AR",
+            "be_BY",
+            "bn_IN",
+            "ca_ES",
+            "cs_CZ",
+            "cy_GB",
+            "da_DK",
+            "en_XX",
+            "eo_EO",
+            "et_EE",
+            "eu_ES",
+            "fa_IR",
+            "fi_FI",
+            "fr_XX",
+            "ga_IE",
+            "gl_ES",
+            "gu_IN",
+            "ha_NG",
+            "hi_IN",
+            "hr_HR",
+            "hu_HU",
+            "hy_AM",
+            "id_ID",
+            "is_IS",
+            "it_IT",
+            "ka_GE",
+            "ko_KR",
+            "ku_TR",
+            "la_VA",
+            "lt_LT",
+            "lv_LV",
+            "mk_MK",
+            "ml_IN",
+            "mn_MN",
+            "ms_MY",
+            "ne_NP",
+            "nl_XX",
+            "no_XX",
+            "pl_PL",
+            "ps_AF",
+            "pt_XX",
+            "ro_RO",
+            "ru_RU",
+            "sa_IN",
+            "sd_PK",
+            "si_LK",
+            "sk_SK",
+            "sl_SI",
+            "so_SO",
+            "sq_AL",
+            "sr_RS",
+            "sv_SE",
+            "sw_KE",
+            "ta_IN",
+            "te_IN",
+            "th_TH",
+            "tl_XX",
+            "vi_VN",
+        ],
+    )
+    base_architecture(args)
+
+
+@register_model_architecture("xmod", "xmod_base_75")
+def xmod_base_75_architecture(args):  # fills in defaults for the "xmod_base_75" arch
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", True)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "af_ZA",
+            "am_ET",
+            "ar_AR",
+            "as_IN",
+            "be_BY",
+            "bn_IN",
+            "br_FR",
+            "bs_BA",
+            "ca_ES",
+            "cs_CZ",
+            "cy_GB",
+            "da_DK",
+            "en_XX",
+            "eo_EO",
+            "et_EE",
+            "eu_ES",
+            "fa_IR",
+            "fi_FI",
+            "fr_XX",
+            "fy_NL",
+            "ga_IE",
+            "gd_GB",
+            "gl_ES",
+            "gu_IN",
+            "ha_NG",
+            "hi_IN",
+            "hr_HR",
+            "hu_HU",
+            "hy_AM",
+            "id_ID",
+            "is_IS",
+            "it_IT",
+            "jv_ID",
+            "ka_GE",
+            "kn_IN",
+            "ko_KR",
+            "ku_TR",
+            "la_VA",
+            "lt_LT",
+            "lv_LV",
+            "mg_MG",
+            "mk_MK",
+            "ml_IN",
+            "mn_MN",
+            "mr_IN",
+            "ms_MY",
+            "ne_NP",
+            "nl_XX",
+            "no_XX",
+            "om_KE",
+            "or_IN",
+            "pa_IN",
+            "pl_PL",
+            "ps_AF",
+            "pt_XX",
+            "ro_RO",
+            "ru_RU",
+            "sa_IN",
+            "sd_PK",
+            "si_LK",
+            "sk_SK",
+            "sl_SI",
+            "so_SO",
+            "sq_AL",
+            "sr_RS",
+            "su_ID",
+            "sv_SE",
+            "sw_KE",
+            "ta_IN",
+            "te_IN",
+            "th_TH",
+            "tl_XX",
+            "vi_VN",
+            "xh_ZA",
+            "yi_DE",
+        ],
+    )
+    base_architecture(args)
+
+
+@register_model_architecture("xmod", "xmod_base")
+def xmod_base_architecture(args):  # fills in defaults for the "xmod_base" arch
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", True)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "en_XX",
+            "id_ID",
+            "vi_VN",
+            "ru_RU",
+            "fa_IR",
+            "sv_SE",
+            "ja_XX",
+            "fr_XX",
+            "de_DE",
+            "ro_RO",
+            "ko_KR",
+            "hu_HU",
+            "es_XX",
+            "fi_FI",
+            "uk_UA",
+            "da_DK",
+            "pt_XX",
+            "no_XX",
+            "th_TH",
+            "pl_PL",
+            "bg_BG",
+            "nl_XX",
+            "zh_CN",
+            "he_IL",
+            "el_GR",
+            "it_IT",
+            "sk_SK",
+            "hr_HR",
+            "tr_TR",
+            "ar_AR",
+            "cs_CZ",
+            "lt_LT",
+            "hi_IN",
+            "zh_TW",
+            "ca_ES",
+            "ms_MY",
+            "sl_SI",
+            "lv_LV",
+            "ta_IN",
+            "bn_IN",
+            "et_EE",
+            "az_AZ",
+            "sq_AL",
+            "sr_RS",
+            "kk_KZ",
+            "ka_GE",
+            "tl_XX",
+            "ur_PK",
+            "is_IS",
+            "hy_AM",
+            "ml_IN",
+            "mk_MK",
+            "be_BY",
+            "la_VA",
+            "te_IN",
+            "eu_ES",
+            "gl_ES",
+            "mn_MN",
+            "kn_IN",
+            "ne_NP",
+            "sw_KE",
+            "si_LK",
+            "mr_IN",
+            "af_ZA",
+            "gu_IN",
+            "cy_GB",
+            "eo_EO",
+            "km_KH",
+            "ky_KG",
+            "uz_UZ",
+            "ps_AF",
+            "pa_IN",
+            "ga_IE",
+            "ha_NG",
+            "am_ET",
+            "lo_LA",
+            "ku_TR",
+            "so_SO",
+            "my_MM",
+            "or_IN",
+            "sa_IN",
+        ],
+    )
+    base_architecture(args)
+
+
+@register_model_architecture("xmod", "xmod_large_prenorm")
+def roberta_base_architecture(args):  # NOTE(review): name says "base" but this registers "xmod_large_prenorm"
+    args.ffn_modules = getattr(args, "ffn_modules", False)
+    args.adapter_modules = getattr(args, "adapter_modules", True)
+    args.adapter_layer_norm = getattr(args, "adapter_layer_norm", True)
+    args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", False)
+    args.ln_before_adapter = getattr(args, "ln_before_adapter", False)
+    # Adapter reduction factor: adapter hidden size is encoder_embed_dim // bottleneck.
+    args.bottleneck = getattr(args, "bottleneck", 4)
+    args.languages = getattr(
+        args,
+        "languages",
+        [
+            "en_XX",
+            "id_ID",
+            "vi_VN",
+            "ru_RU",
+            "fa_IR",
+            "sv_SE",
+            "ja_XX",
+            "fr_XX",
+            "de_DE",
+            "ro_RO",
+            "ko_KR",
+            "hu_HU",
+            "es_XX",
+            "fi_FI",
+            "uk_UA",
+            "da_DK",
+            "pt_XX",
+            "no_XX",
+            "th_TH",
+            "pl_PL",
+            "bg_BG",
+            "nl_XX",
+            "zh_CN",
+            "he_IL",
+            "el_GR",
+            "it_IT",
+            "sk_SK",
+            "hr_HR",
+            "tr_TR",
+            "ar_AR",
+            "cs_CZ",
+            "lt_LT",
+            "hi_IN",
+            "zh_TW",
+            "ca_ES",
+            "ms_MY",
+            "sl_SI",
+            "lv_LV",
+            "ta_IN",
+            "bn_IN",
+            "et_EE",
+            "az_AZ",
+            "sq_AL",
+            "sr_RS",
+            "kk_KZ",
+            "ka_GE",
+            "tl_XX",
+            "ur_PK",
+            "is_IS",
+            "hy_AM",
+            "ml_IN",
+            "mk_MK",
+            "be_BY",
+            "la_VA",
+            "te_IN",
+            "eu_ES",
+            "gl_ES",
+            "mn_MN",
+            "kn_IN",
+            "ne_NP",
+            "sw_KE",
+            "si_LK",
+            "mr_IN",
+            "af_ZA",
+            "gu_IN",
+            "cy_GB",
+            "eo_EO",
+            "km_KH",
+            "ky_KG",
+            "uz_UZ",
+            "ps_AF",
+            "pa_IN",
+            "ga_IE",
+            "ha_NG",
+            "am_ET",
+            "lo_LA",
+            "ku_TR",
+            "so_SO",
+            "my_MM",
+            "or_IN",
+            "sa_IN",
+        ],
+    )
+
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
+    args.encoder_layers = getattr(args, "encoder_layers", 24)
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
+    base_architecture(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
new file mode 100644
index 0000000000..47a91cdc23
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
@@ -0,0 +1,179 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.modules.transformer_layer import TransformerEncoderLayer
+from typing import Optional
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+from torch import Tensor
+
+
+class Adapter(nn.Module):  # two-layer MLP: embed_dim -> embed_dim // red_fac -> embed_dim
+    def __init__(self, cfg, red_fac=2):  # red_fac: integer divisor applied to embed_dim for the hidden width
+        super(Adapter, self).__init__()
+        self.cfg = cfg
+        self.embed_dim = cfg.encoder_embed_dim
+        self.quant_noise = getattr(cfg, "quant_noise_pq", 0)
+        self.quant_noise_block_size = getattr(cfg, "quant_noise_pq_block_size", 8) or 8
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(cfg, "activation_fn", "relu") or "relu"
+        )
+        self.fc1 = quant_noise(  # down-projection
+            nn.Linear(self.embed_dim, self.embed_dim // red_fac),
+            p=self.quant_noise,
+            block_size=self.quant_noise_block_size,
+        )
+        self.fc2 = quant_noise(  # up-projection back to embed_dim
+            nn.Linear(self.embed_dim // red_fac, self.embed_dim),
+            p=self.quant_noise,
+            block_size=self.quant_noise_block_size,
+        )
+        activation_dropout_p = getattr(cfg, "activation_dropout", 0) or 0
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use cfg.relu_dropout
+            activation_dropout_p = getattr(cfg, "relu_dropout", 0) or 0
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+
+    def forward(self, x):  # returns fc2(dropout(act(fc1(x)))); no residual is added here
+        x = self.activation_fn(self.fc1(x))
+        # dropout is applied unless cfg defines a falsy ``adapter_dropout``
+        if not hasattr(self.cfg, "adapter_dropout") or self.cfg.adapter_dropout:
+            x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        return x
+
+class XMODTransformerEncoderLayerBase(TransformerEncoderLayer):
+    """Encoder layer block with optional per-language adapter modules.
+
+    In the original paper each operation (multi-head attention or FFN) is
+    postprocessed with: `dropout -> add residual -> layernorm`. In the
+    tensor2tensor code they suggest that learning is more robust when
+    preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *cfg.encoder.normalize_before* to ``True``.
+
+    Args:
+        cfg: layer configuration; the adapter options are read from it
+    """
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        if hasattr(cfg, "adapter_modules") and cfg.adapter_modules:
+            export = getattr(cfg, "export", False)
+            if cfg.adapter_layer_norm:
+                self.adapter_layer_norm = LayerNorm(self.embed_dim, export=export)
+            # Reduction factor for the adapter hidden size; 2 unless cfg
+            # carries an explicit ``bottleneck``.
+            bottleneck = getattr(self.cfg, "bottleneck", 2)
+            # One adapter per configured language, keyed by language code.
+            self.adapter_modules = nn.ModuleDict()
+            for language in cfg.languages:
+                self.adapter_modules[str(language)] = Adapter(cfg, red_fac=bottleneck)
+
+    def lang_adapter(self, lang_id, x):
+        """Route ``x`` through the per-language adapters, if configured."""
+        if hasattr(self.cfg, "adapter_modules") and self.cfg.adapter_modules:
+            if lang_id is None:
+                lang_id = ["en_XX"] * x.shape[1]  # default: English for all of dim 1
+            d_langs = [lang_id[0]]  # run-length encode consecutive equal languages
+            lang_lengths = [1]
+            for lang in lang_id[1:]:
+                if lang == d_langs[-1]:
+                    lang_lengths[-1] += 1
+                else:
+                    d_langs.append(lang)
+                    lang_lengths.append(1)
+
+            # The residual is taken after the layer norm when
+            # cfg.ln_before_adapter is set, otherwise before it.
+            ln_before_adapter = getattr(self.cfg, "ln_before_adapter", False)
+            if not ln_before_adapter:
+                residual = x
+            if self.cfg.adapter_layer_norm:
+                x = self.adapter_layer_norm(x)
+            elif self.cfg.adapter_reuse_layer_norm:
+                x = self.final_layer_norm(x)
+            if ln_before_adapter:
+                residual = x
+
+            split_x = torch.split(x, lang_lengths, 1)
+            x_ = []
+            for lang, s_x in zip(d_langs, split_x):
+                lang = lang.replace("_rom", "").replace("_zaw", "")  # suffixed ids use the base adapter
+                x_.append(self.adapter_modules[str(lang)](s_x))
+            x = torch.cat(x_, 1)
+
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+
+        return x
+
+    def forward(
+        self,
+        x,
+        encoder_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor] = None,
+        lang_id: Optional[list] = None,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, seq_len)` where padding elements are indicated by ``1``.
+            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`
+                (both equal to `seq_len` here). `attn_mask[tgt_i, src_j] = 1`
+                means `src_j` is masked out when computing `tgt_i`.
+            lang_id (list, optional): one language code per batch element,
+                selecting the adapter; ``None`` falls back to ``"en_XX"``.
+                Ignored when adapters are disabled.
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        # anything in original attn_mask = 1, becomes -1e8
+        # anything in original attn_mask = 0, becomes 0
+        # Note that we cannot use -inf here, because at some edge cases,
+        # the attention weight (before softmax) for some padded element in query
+        # will become -inf, which results in NaN in model parameters
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        x, _ = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=encoder_padding_mask,
+            need_weights=False,
+            attn_mask=attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+
+        x = self.lang_adapter(lang_id, x)
+
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
new file mode 100644
index 0000000000..dcfda9b82a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
@@ -0,0 +1,106 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+from .adaptive_input import AdaptiveInput
+from .adaptive_softmax import AdaptiveSoftmax
+from .base_layer import BaseLayer
+from .beamable_mm import BeamableMM
+from .character_token_embedder import CharacterTokenEmbedder
+from .conv_tbc import ConvTBC
+from .cross_entropy import cross_entropy
+from .downsampled_multihead_attention import DownsampledMultiHeadAttention
+from .dynamic_convolution import DynamicConv, DynamicConv1dTBC, DynamicConv_scripatable
+from .dynamic_crf_layer import DynamicCRF
+from .ema_module import EMAModuleConfig, EMAModule
+from .fairseq_dropout import FairseqDropout
+from .fp32_batch_norm import Fp32BatchNorm
+from .fp32_group_norm import Fp32GroupNorm
+from .fp32_instance_norm import Fp32InstanceNorm
+from .gelu import gelu, gelu_accurate
+from .grad_multiply import GradMultiply
+from .gumbel_vector_quantizer import GumbelVectorQuantizer
+from .kmeans_vector_quantizer import KmeansVectorQuantizer
+from .layer_drop import LayerDropModuleList
+from .layer_norm import Fp32LayerNorm, LayerNorm
+from .learned_positional_embedding import LearnedPositionalEmbedding
+from .lightweight_convolution import LightweightConv, LightweightConv1dTBC
+from .linearized_convolution import LinearizedConvolution
+from .location_attention import LocationAttention
+from .lstm_cell_with_zoneout import LSTMCellWithZoneOut
+from .multihead_attention import MultiheadAttention
+from .positional_embedding import PositionalEmbedding
+from .same_pad import SamePad, SamePad2d
+from .scalar_bias import ScalarBias
+from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
+from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer
+from .transformer_sentence_encoder import TransformerSentenceEncoder
+from .transpose_last import TransposeLast
+from .unfold import unfold1d
+from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
+from .vggblock import VGGBlock
+from .espnet_multihead_attention import (
+    ESPNETMultiHeadedAttention,
+    RelPositionMultiHeadedAttention,
+    RotaryPositionMultiHeadedAttention,
+)
+from .rotary_positional_embedding import RotaryPositionalEmbedding
+from .positional_encoding import (
+    RelPositionalEncoding,
+)
+
+__all__ = [
+    "AdaptiveInput",
+    "AdaptiveSoftmax",
+    "BaseLayer",
+    "BeamableMM",
+    "CharacterTokenEmbedder",
+    "ConvTBC",
+    "cross_entropy",
+    "DownsampledMultiHeadAttention",
+    "DynamicConv1dTBC",
+    "DynamicConv",
+    "DynamicConv_scripatable",
+    "DynamicCRF",
+    "EMAModule",
+    "EMAModuleConfig",
+    "FairseqDropout",
+    "Fp32BatchNorm",
+    "Fp32GroupNorm",
+    "Fp32LayerNorm",
+    "Fp32InstanceNorm",
+    "gelu",
+    "gelu_accurate",
+    "GradMultiply",
+    "GumbelVectorQuantizer",
+    "KmeansVectorQuantizer",
+    "LayerDropModuleList",
+    "LayerNorm",
+    "LearnedPositionalEmbedding",
+    "LightweightConv1dTBC",
+    "LightweightConv",
+    "LinearizedConvolution",
+    "LocationAttention",
+    "LSTMCellWithZoneOut",
+    "MultiheadAttention",
+    "PositionalEmbedding",
+    "SamePad",
+    "SamePad2d",
+    "ScalarBias",
+    "SinusoidalPositionalEmbedding",
+    "TransformerSentenceEncoderLayer",
+    "TransformerSentenceEncoder",
+    "TransformerDecoderLayer",
+    "TransformerEncoderLayer",
+    "TransposeLast",
+    "VGGBlock",
+    "unfold1d",
+    "ESPNETMultiHeadedAttention",
+    "PositionalEmbedding",
+    "RelPositionMultiHeadedAttention",
+    "RelPositionalEncoding",
+    "RotaryPositionalEmbedding",
+    "RotaryPositionMultiHeadedAttention",
+]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
new file mode 100644
index 0000000000..01ac4accac
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import List
+
+import torch
+from torch import nn
+
+from fairseq.modules.quant_noise import quant_noise
+
+
+class AdaptiveInput(nn.Module):  # banded embedding: one nn.Embedding + projection per cutoff band
+    def __init__(
+        self,
+        vocab_size: int,
+        padding_idx: int,
+        initial_dim: int,
+        factor: float,
+        output_dim: int,
+        cutoff: List[int],
+        q_noise: float = 0,
+        qn_block_size: int = 8,
+    ):
+        super().__init__()
+
+        if vocab_size > cutoff[-1]:  # extend so the last band ends at vocab_size
+            cutoff = cutoff + [vocab_size]
+        else:
+            assert (
+                vocab_size == cutoff[-1]
+            ), "cannot specify cutoff larger than vocab size"
+
+        self.cutoff = cutoff
+        self.embedding_dim = output_dim
+        self.padding_idx = padding_idx
+
+        self.embeddings = nn.ModuleList()
+        for i in range(len(self.cutoff)):
+            prev = self.cutoff[i - 1] if i > 0 else 0
+            size = self.cutoff[i] - prev  # number of ids in band i
+            dim = int(initial_dim // (factor**i))  # band i width: initial_dim // factor**i
+            seq = nn.Sequential(
+                nn.Embedding(size, dim, self.padding_idx),
+                quant_noise(
+                    nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size
+                ),
+            )
+
+            self.embeddings.append(seq)
+            self.padding_idx = None  # only the first band's nn.Embedding is given padding_idx
+        self.padding_idx = padding_idx  # restore the attribute cleared inside the loop
+
+        def init_weights(m):
+            if isinstance(m, nn.Embedding):
+                nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5)
+                nn.init.constant_(m.weight[padding_idx], 0)  # applied to every band's table
+            elif hasattr(m, "weight"):
+                nn.init.xavier_uniform_(m.weight)
+
+        self.apply(init_weights)
+
+        self.register_buffer("_float_tensor", torch.FloatTensor(1))  # forward() allocates its result from this
+
+    def weights_for_band(self, band: int):  # returns (embedding weight, projection weight) of `band`
+        return self.embeddings[band][0].weight, self.embeddings[band][1].weight
+
+    def forward(self, input: torch.Tensor):
+        result = self._float_tensor.new(input.shape + (self.embedding_dim,))  # output buffer, filled per band
+        for i in range(len(self.cutoff)):
+            mask = input.lt(self.cutoff[i])
+            if i > 0:
+                mask.mul_(input.ge(self.cutoff[i - 1]))  # keep ids in [cutoff[i-1], cutoff[i])
+                chunk_input = input[mask] - self.cutoff[i - 1]  # re-base ids to this band's table
+            else:
+                chunk_input = input[mask]
+            if mask.any():
+                result[mask] = self.embeddings[i](chunk_input)
+        return result
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
new file mode 100644
index 0000000000..ae0c77ba0f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import operator
+
+import torch
+import torch.nn.functional as F
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+from torch import nn
+
+
+class TiedLinear(nn.Module):
+    """Bias-free linear layer that applies a weight tensor passed in by the caller."""
+    def __init__(self, weight, transpose):
+        super().__init__()
+        self.weight, self.transpose = weight, transpose
+
+    def forward(self, input):
+        return F.linear(input, self.weight.t() if self.transpose else self.weight)
+
+
+class TiedHeadModule(nn.Module):
+    def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size):
+        super().__init__()
+        tied_emb, _ = weights  # only the first weight of the pair is used here
+        self.num_words, emb_dim = tied_emb.size()
+
+        self.word_proj = quant_noise(
+            TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size
+        )
+        if input_dim != emb_dim:  # map the input to emb_dim before the tied layer
+            self.word_proj = nn.Sequential(
+                quant_noise(
+                    nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size
+                ),
+                self.word_proj,
+            )
+
+        self.class_proj = quant_noise(
+            nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size
+        )
+        self.out_dim = self.num_words + num_classes
+
+        self.register_buffer("_float_tensor", torch.FloatTensor(1))
+
+    def forward(self, input):
+        inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1)  # rows once flat
+        out = self._float_tensor.new(inp_sz, self.out_dim)
+        out[:, : self.num_words] = self.word_proj(input.view(inp_sz, -1))
+        out[:, self.num_words :] = self.class_proj(input.view(inp_sz, -1))
+        return out
+
+
+class AdaptiveSoftmax(nn.Module):
+    """
+    This is an implementation of the efficient softmax approximation for
+    graphical processing units (GPU), described in the paper "Efficient softmax
+    approximation for GPUs" (http://arxiv.org/abs/1609.04309).
+    """
+
+    def __init__(
+        self,
+        vocab_size,
+        input_dim,
+        cutoff,
+        dropout,
+        factor=4.0,
+        adaptive_inputs=None,
+        tie_proj=False,
+        q_noise=0,
+        qn_block_size=8,
+    ):
+        super().__init__()
+
+        if vocab_size > cutoff[-1]:
+            cutoff = cutoff + [vocab_size]  # new list; the caller's list is untouched
+        else:
+            assert (
+                vocab_size == cutoff[-1]
+            ), "cannot specify cutoff larger than vocab size"
+
+        output_dim = cutoff[0] + len(cutoff) - 1  # head words + one slot per tail band
+
+        self.vocab_size = vocab_size
+        self.cutoff = cutoff
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.input_dim = input_dim
+        self.factor = factor
+        self.q_noise = q_noise
+        self.qn_block_size = qn_block_size
+
+        self.lsm = nn.LogSoftmax(dim=1)
+
+        if adaptive_inputs is not None:
+            self.head = TiedHeadModule(
+                adaptive_inputs.weights_for_band(0),
+                input_dim,
+                len(cutoff) - 1,
+                self.q_noise,
+                self.qn_block_size,
+            )
+        else:
+            self.head = quant_noise(
+                nn.Linear(input_dim, output_dim, bias=False),
+                self.q_noise,
+                self.qn_block_size,
+            )
+
+        self._make_tail(adaptive_inputs, tie_proj)
+
+        def init_weights(m):
+            if (
+                hasattr(m, "weight")
+                and not isinstance(m, TiedLinear)
+                and not isinstance(m, TiedHeadModule)
+            ):
+                nn.init.xavier_uniform_(m.weight)
+
+        self.apply(init_weights)  # tied modules are skipped by init_weights
+
+        self.register_buffer("version", torch.LongTensor([1]))
+
+    def _make_tail(self, adaptive_inputs=None, tie_proj=False):
+        self.tail = nn.ModuleList()  # one projection -> dropout -> output stack per tail band
+        for i in range(len(self.cutoff) - 1):
+            dim = int(self.input_dim // self.factor ** (i + 1))
+
+            tied_emb, tied_proj = (
+                adaptive_inputs.weights_for_band(i + 1)
+                if adaptive_inputs is not None
+                else (None, None)
+            )
+
+            if tied_proj is not None:
+                if tie_proj:
+                    proj = quant_noise(
+                        TiedLinear(tied_proj, transpose=True),
+                        self.q_noise,
+                        self.qn_block_size,
+                    )
+                else:
+                    # Same in/out sizes as the tied projection, but separate weights.
+                    proj = quant_noise(
+                        nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False),
+                        self.q_noise,
+                        self.qn_block_size,
+                    )
+            else:
+                proj = quant_noise(
+                    nn.Linear(self.input_dim, dim, bias=False),
+                    self.q_noise,
+                    self.qn_block_size,
+                )
+
+            if tied_emb is None:
+                out_proj = nn.Linear(
+                    dim, self.cutoff[i + 1] - self.cutoff[i], bias=False
+                )
+            else:
+                out_proj = TiedLinear(tied_emb, transpose=False)
+
+            m = nn.Sequential(
+                proj,
+                nn.Dropout(self.dropout_module.p),
+                quant_noise(out_proj, self.q_noise, self.qn_block_size),
+            )
+
+            self.tail.append(m)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        # A state dict without the "<name>.version" entry is rejected outright.
+        if name + ".version" not in state_dict:
+            raise Exception("This version of the model is no longer supported")
+
+    def adapt_target(self, target):
+        """
+        Split flattened targets by cutoff band. Returns ``(new_target,
+        target_idxs)``: ``new_target[0]`` holds head targets, with every
+        tail word replaced by its cluster id ``cutoff[0] + band``; entry
+        ``band + 1`` holds in-band offsets (or None if the band is empty).
+        """
+
+        flat = target.view(-1)
+        head_target = flat.clone()
+        new_target, target_idxs = [head_target], []
+
+        bands = zip(self.cutoff[:-1], self.cutoff[1:])
+        for band, (lo, hi) in enumerate(bands):
+            in_band = flat.ge(lo) & flat.lt(hi)
+            head_target[in_band] = self.cutoff[0] + band
+
+            if not in_band.any():
+                target_idxs.append(None)
+                new_target.append(None)
+                continue
+            target_idxs.append(in_band.nonzero(as_tuple=False).squeeze(1))
+            new_target.append(flat[in_band] - lo)
+        return new_target, target_idxs
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: hidden states (b x t x d)
+            target: target word ids (b x t)
+        Returns:
+            (output, new_target): per-band outputs (head first) and targets
+        """
+
+        hidden = input.contiguous().view(-1, input.size(-1))
+        hidden = self.dropout_module(hidden)
+
+        new_target, target_idxs = self.adapt_target(target)
+        output = [self.head(hidden)]
+
+        # A band with no targets in this batch contributes None.
+        for tail, idxs in zip(self.tail, target_idxs):
+            if idxs is None:
+                output.append(None)
+            else:
+                output.append(tail(hidden.index_select(0, idxs)))
+        return output, new_target
+
+    def get_log_prob(self, input, target):
+        """
+        Computes the log probabilities for all the words of the vocabulary,
+        given a 3D tensor (bsz x length x dim) of hidden vectors.
+        """
+
+        bsz, length, dim = input.size()
+        input = input.contiguous().view(-1, dim)
+
+        if target is not None:
+            _, target_idxs = self.adapt_target(target)
+        else:
+            target_idxs = None
+
+        head_y = self.head(input)
+        log_probs = head_y.new_zeros(input.size(0), self.vocab_size)
+
+        head_sz = self.cutoff[0] + len(self.tail)
+        log_probs[:, :head_sz] = self.lsm(head_y)
+        tail_priors = log_probs[:, self.cutoff[0] : head_sz].clone()  # copied before overwrite
+
+        for i in range(len(self.tail)):
+            start = self.cutoff[i]
+            end = self.cutoff[i + 1]
+
+            if target_idxs is None:  # no targets given: score every row for this band
+                tail_out = log_probs[:, start:end]
+                tail_out.copy_(self.tail[i](input))
+                log_probs[:, start:end] = self.lsm(tail_out).add_(
+                    tail_priors[:, i, None]
+                )
+            elif target_idxs[i] is not None:  # only rows whose target is in this band
+                idxs = target_idxs[i]
+                tail_out = log_probs[idxs, start:end]
+                tail_out.copy_(self.tail[i](input[idxs]))
+                log_probs[idxs, start:end] = self.lsm(tail_out).add_(
+                    tail_priors[idxs, i, None]
+                )
+
+        log_probs = log_probs.view(bsz, length, -1)
+        return log_probs
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
new file mode 100644
index 0000000000..e823f7bae2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+import torch
+import sys
+from fairseq import utils
+from fairseq.distributed import utils as distributed_utils
+from fairseq.modules.layer_norm import LayerNorm
+
+
+class BaseLayer(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        self.num_workers = distributed_utils.get_data_parallel_world_size()
+        expert_centroids = torch.empty(self.num_workers, args.decoder_embed_dim)  # one row per worker
+        torch.nn.init.orthogonal_(expert_centroids, gain=0.1)
+        self.register_parameter(
+            "expert_centroids", torch.nn.Parameter(expert_centroids)
+        )
+        self.expert_network = nn.Sequential(
+            *([BaseSublayer(args) for _ in range(args.base_sublayers)])
+        )
+        self.expert_id = distributed_utils.get_data_parallel_rank()
+        self.shuffle = args.base_shuffle
+        self.cpp = self.load_assignment()  # the imported fairseq.libbase module
+
+        # Add a special attribute to the expert parameters, so we know not to sync their gradients
+        for param in self.expert_network.parameters():
+            param.expert = True
+
+    def forward(self, input_features, *args, **kwargs):
+        features = input_features.reshape(-1, input_features.size(-1))
+        # NOTE(review): training mode is inferred from requires_grad, not self.training.
+        is_training = input_features.requires_grad
+        if self.shuffle and is_training:
+            # Send each token to a random worker, to break correlations within the batch
+            shuffle_sort = torch.randperm(features.size(0), device=features.device)
+            features = All2All.apply(features[shuffle_sort])
+
+        with torch.no_grad():
+            # Compute similarity of each token to each expert, for routing
+            token_expert_affinities = features.matmul(
+                self.expert_centroids.transpose(0, 1)
+            )
+
+        # Compute which token goes to which expert
+        sort_by_expert, input_splits, output_splits = (
+            self.balanced_assignment(token_expert_affinities)
+            if is_training
+            else self.greedy_assignment(token_expert_affinities)
+        )
+        # Swap these tokens for the right ones for our expert
+        routed_features = All2All.apply(
+            features[sort_by_expert], output_splits, input_splits
+        )
+
+        if routed_features.size(0) > 0:
+            # Mix in the expert network based on how appropriate it is for these tokens
+            alpha = torch.sigmoid(
+                routed_features.mv(self.expert_centroids[self.expert_id])
+            ).unsqueeze(1)
+            routed_features = (
+                alpha * self.expert_network(routed_features)
+                + (1 - alpha) * routed_features
+            )
+        # Return to original worker and ordering
+        result = All2All.apply(routed_features, input_splits, output_splits)[
+            self.inverse_sort(sort_by_expert)
+        ]
+
+        if self.shuffle and is_training:
+            # Undo shuffling
+            result = All2All.apply(result)[self.inverse_sort(shuffle_sort)]
+
+        # Return additional Nones for compatibility with TransformerDecoderLayer
+        return result.view(input_features.size()), None, None
+
+    def inverse_sort(self, order):
+        # Build the index that undoes a sort: xs==xs[order][inverse_sort(order)]
+        positions = torch.arange(0, order.size(0), device=order.device)
+        inverse = torch.empty_like(order)
+        return inverse.scatter_(0, order, positions)
+
+    def balanced_assignment(self, scores):
+        # Non-finite scores break the assignment: overwrite them with the finite min.
+        bad = ~scores.isfinite()
+        if bad.any():
+            scores[bad] = scores[~bad].min()
+        return self.cpp.balanced_assignment(scores), None, None
+
+    # Assigns each token to the top k experts
+    def greedy_assignment(self, scores, k=1):
+        token_to_workers = torch.topk(scores, dim=1, k=k, largest=True).indices.view(-1)
+        token_to_workers, sort_ordering = torch.sort(token_to_workers)  # group by worker id
+        worker2token = sort_ordering // k  # flat top-k position -> token (row) index
+
+        # Find how many tokens we're sending to each other worker (being careful for sending 0 tokens to some workers)
+        output_splits = torch.zeros(
+            (self.num_workers,), dtype=torch.long, device=scores.device
+        )
+        workers, counts = torch.unique_consecutive(token_to_workers, return_counts=True)
+        output_splits[workers] = counts
+        # Tell other workers how many tokens to expect from us
+        input_splits = All2All.apply(output_splits)
+        return worker2token, input_splits.tolist(), output_splits.tolist()
+
+    def load_assignment(self):
+        # Import the libbase extension on demand; on failure, print the build hint.
+        try:
+            from fairseq import libbase
+        except ImportError as e:
+            sys.stderr.write(
+                "ERROR: missing libbase. run `python setup.py build_ext --inplace`\n"
+            )
+            raise e
+        else:
+            return libbase
+
+
+class BaseSublayer(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu") or "relu"
+        )
+        self.norm = LayerNorm(args.decoder_embed_dim, export=False)
+        self.ff1 = torch.nn.Linear(args.decoder_embed_dim, args.decoder_ffn_embed_dim)
+        self.ff2 = torch.nn.Linear(args.decoder_ffn_embed_dim, args.decoder_embed_dim)
+        self.ff2.weight.data.zero_()  # ff2 initially outputs only its bias
+
+    def forward(self, xs):
+        return xs + self.ff2(self.activation_fn(self.ff1(self.norm(xs))))  # residual
+
+
+# Wraps torch.distributed.all_to_all_single as a function that supports autograd
+class All2All(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, xs, input_splits=None, output_splits=None):
+        ctx.input_splits = input_splits  # kept so backward can reverse the exchange
+        ctx.output_splits = output_splits
+
+        ys = (
+            torch.empty_like(xs)
+            if output_splits is None
+            else xs.new_empty(size=[sum(output_splits)] + list(xs.size()[1:]))
+        )
+        torch.distributed.all_to_all_single(
+            ys, xs, output_split_sizes=output_splits, input_split_sizes=input_splits
+        )
+        return ys
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        result = (
+            torch.empty_like(grad_output)
+            if ctx.input_splits is None
+            else grad_output.new_empty(
+                size=[sum(ctx.input_splits)] + list(grad_output.size()[1:])
+            )
+        )
+        torch.distributed.all_to_all_single(
+            result,
+            grad_output,
+            output_split_sizes=ctx.input_splits,  # split sizes are swapped vs. forward
+            input_split_sizes=ctx.output_splits,
+        )
+        return result, None, None  # no gradients for the two split-size arguments
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
new file mode 100644
index 0000000000..eff1a4607f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
@@ -0,0 +1,49 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+class BeamableMM(nn.Module):
+    """This module provides an optimized MM for beam decoding with attention.
+
+    It leverages the fact that the source-side of the input is replicated beam
+    times and the target-side of the input is of width one. This layer speeds up
+    inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)}
+    with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}.
+    """
+
+    def __init__(self, beam_size=None):
+        super(BeamableMM, self).__init__()
+        self.beam_size = beam_size
+
+    def forward(self, input1, input2):
+        if (
+            not self.training  # test mode
+            and self.beam_size is not None  # beam size is set
+            and input1.dim() == 3  # only support batched input
+            and input1.size(1)
+            == 1  # single time step update
+        ):
+            bsz, beam = input1.size(0), self.beam_size
+
+            # bsz x 1 x nhu --> bsz/beam x beam x nhu
+            input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1)
+
+            # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu
+            input2 = input2.unfold(0, beam, beam)[:, :, :, 0]
+
+            # use non batched operation if bsz = beam
+            if input1.size(0) == 1:
+                output = torch.mm(input1[0, :, :], input2[0, :, :])
+            else:
+                output = input1.bmm(input2)
+            return output.view(bsz, 1, -1)
+        else:
+            return input1.bmm(input2)
+
+    def set_beam_size(self, beam_size):
+        self.beam_size = beam_size
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
new file mode 100644
index 0000000000..181221b61b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
@@ -0,0 +1,214 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from fairseq.data import Dictionary
+from torch import nn
+
+
+CHAR_PAD_IDX = 0
+CHAR_EOS_IDX = 257
+
+
+logger = logging.getLogger(__name__)
+
+
+class CharacterTokenEmbedder(torch.nn.Module):
+    def __init__(
+        self,
+        vocab: Dictionary,
+        filters: List[Tuple[int, int]],  # (kernel width, output channels) pairs
+        char_embed_dim: int,
+        word_embed_dim: int,
+        highway_layers: int,
+        max_char_len: int = 50,
+        char_inputs: bool = False,
+    ):
+        super(CharacterTokenEmbedder, self).__init__()
+
+        self.onnx_trace = False
+        self.embedding_dim = word_embed_dim
+        self.max_char_len = max_char_len
+        self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0)  # ids 0..256
+        self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim))
+        self.eos_idx, self.unk_idx = 0, 1  # row indices into symbol_embeddings
+        self.char_inputs = char_inputs
+
+        self.convolutions = nn.ModuleList()
+        for width, out_c in filters:
+            self.convolutions.append(
+                nn.Conv1d(char_embed_dim, out_c, kernel_size=width)
+            )
+
+        last_dim = sum(f[1] for f in filters)  # total channels across all filters
+
+        self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None
+
+        self.projection = nn.Linear(last_dim, word_embed_dim)
+
+        assert (
+            vocab is not None or char_inputs
+        ), "vocab must be set if not using char inputs"
+        self.vocab = None
+        if vocab is not None:
+            self.set_vocab(vocab, max_char_len)
+
+        self.reset_parameters()
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # forward() then uses torch.where instead of masked writes
+
+    def set_vocab(self, vocab, max_char_len):
+        """Build ``word_to_char``: one row of ``max_char_len`` char ids per word.
+
+        Special symbols (index < ``vocab.nspecial``) get an all-zero row. Every
+        other word is encoded to bytes and each byte is shifted by one so that
+        0 stays free for padding; rows are zero-padded, and words longer than
+        ``max_char_len`` are truncated (the number truncated is logged).
+        """
+        word_to_char = torch.LongTensor(len(vocab), max_char_len)
+
+        truncated = 0
+        for i in range(len(vocab)):
+            # +1 for padding
+            chars = [] if i < vocab.nspecial else [c + 1 for c in vocab[i].encode()]
+            if len(chars) > max_char_len:
+                truncated += 1
+            padding = [0] * (max_char_len - len(chars))
+            word_to_char[i] = torch.LongTensor((chars + padding)[:max_char_len])
+
+        if truncated > 0:
+            message = "truncated {} words longer than {} characters"
+            logger.info(message.format(truncated, max_char_len))
+
+        self.vocab = vocab
+        self.word_to_char = word_to_char
+
+    @property
+    def padding_idx(self):  # pad index of the vocab, or of a default Dictionary if unset
+        return Dictionary().pad() if self.vocab is None else self.vocab.pad()
+
+    def reset_parameters(self):
+        nn.init.xavier_normal_(self.char_embeddings.weight)
+        nn.init.xavier_normal_(self.symbol_embeddings)
+        nn.init.xavier_uniform_(self.projection.weight)
+
+        nn.init.constant_(  # re-zero the padding row overwritten by the init above
+            self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.0
+        )
+        nn.init.constant_(self.projection.bias, 0.0)
+
+    def forward(
+        self,
+        input: torch.Tensor,
+    ):
+        if self.char_inputs:
+            chars = input.view(-1, self.max_char_len)
+            pads = chars[:, 0].eq(CHAR_PAD_IDX)  # a word is classified by its first char id
+            eos = chars[:, 0].eq(CHAR_EOS_IDX)
+            if eos.any():
+                if self.onnx_trace:
+                    chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars)
+                else:
+                    chars[eos] = 0  # in-place write; `chars` is a view of `input`
+
+            unk = None
+        else:
+            flat_words = input.view(-1)
+            chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as(
+                input
+            )
+            pads = flat_words.eq(self.vocab.pad())
+            eos = flat_words.eq(self.vocab.eos())
+            unk = flat_words.eq(self.vocab.unk())
+
+        word_embs = self._convolve(chars)
+        if self.onnx_trace:  # same overrides as the else-branch, via torch.where
+            if pads.any():
+                word_embs = torch.where(
+                    pads.unsqueeze(1), word_embs.new_zeros(1), word_embs
+                )
+            if eos.any():
+                word_embs = torch.where(
+                    eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs
+                )
+            if unk is not None and unk.any():
+                word_embs = torch.where(
+                    unk.unsqueeze(1), self.symbol_embeddings[self.unk_idx], word_embs
+                )
+        else:
+            if pads.any():
+                word_embs[pads] = 0
+            if eos.any():
+                word_embs[eos] = self.symbol_embeddings[self.eos_idx]
+            if unk is not None and unk.any():
+                word_embs[unk] = self.symbol_embeddings[self.unk_idx]
+
+        return word_embs.view(input.size()[:2] + (-1,))
+
+    def _convolve(
+        self,
+        char_idxs: torch.Tensor,
+    ):
+        """Embed each row of character ids as a single vector.
+
+        Every convolution output is max-pooled over the character axis and passed
+        through ReLU; the pooled features are concatenated, optionally run
+        through the highway layers, and finally projected linearly.
+        """
+        embedded = self.char_embeddings(char_idxs).transpose(1, 2)  # BTC -> BCT
+
+        # One pooled feature vector per convolution.
+        pooled = [
+            F.relu(torch.max(conv(embedded), -1)[0]) for conv in self.convolutions
+        ]
+        features = torch.cat(pooled, dim=-1)
+
+        if self.highway is not None:
+            features = self.highway(features)
+
+        return self.projection(features)
+
+
+class Highway(torch.nn.Module):
+    """
+    A `Highway layer <https://arxiv.org/abs/1505.00387>`_.
+    Adopted from the AllenNLP implementation.
+    """
+
+    def __init__(self, input_dim: int, num_layers: int = 1):
+        super(Highway, self).__init__()
+        self.input_dim = input_dim
+        self.layers = nn.ModuleList(  # each layer emits 2 * input_dim: candidate + gate
+            [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
+        )
+        self.activation = nn.ReLU()
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        for layer in self.layers:
+            # As per comment in AllenNLP:
+            # We should bias the highway layer to just carry its input forward.  We do that by
+            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
+            # be high, so we will carry the input forward.  The bias on `B(x)` is the second half
+            # of the bias vector in each Linear layer.
+            nn.init.constant_(layer.bias[self.input_dim :], 1)
+
+            nn.init.constant_(layer.bias[: self.input_dim], 0)
+            nn.init.xavier_normal_(layer.weight)
+
+    def forward(self, x: torch.Tensor):
+        for layer in self.layers:
+            projection = layer(x)
+            proj_x, gate = projection.chunk(2, dim=-1)  # first half: candidate, second: gate
+            proj_x = self.activation(proj_x)
+            gate = torch.sigmoid(gate)
+            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x  # gate near 1 keeps x
+        return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
new file mode 100644
index 0000000000..aa0b5929a3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+import torch.utils.checkpoint as checkpoint
+from fairseq import utils
+
+
+def checkpoint_wrapper(m, offload_to_cpu=False):
+    """
+    A friendlier wrapper for performing activation checkpointing.
+
+    Compared to the PyTorch version, this version:
+    - wraps an nn.Module, so that all subsequent calls will use checkpointing
+    - handles keyword arguments in the forward
+    - handles non-Tensor outputs from the forward
+
+    Usage::
+
+        checkpointed_module = checkpoint_wrapper(my_module, offload_to_cpu=True)
+        a, b = checkpointed_module(x, y=3, z=torch.Tensor([1]))
+    """
+    # Refuse to wrap a module twice: the saved original forward would be lost.
+    assert not hasattr(
+        m, "precheckpoint_forward"
+    ), "checkpoint function has already been applied?"
+    m.precheckpoint_forward = m.forward  # kept so unwrap_checkpoint can restore it
+    m.forward = functools.partial(
+        _checkpointed_forward,
+        m.precheckpoint_forward,  # original_forward
+        offload_to_cpu,
+    )
+    return m
+
+
+def unwrap_checkpoint(m: torch.nn.Module):
+    """Undo ``checkpoint_wrapper`` on *m* and all of its submodules; return *m*."""
+    restorations = (
+        ("precheckpoint_forward", "forward"),
+        ("old_deepcopy_method", "__deepcopy__"),
+    )
+    for module in m.modules():
+        for saved, target in restorations:
+            if hasattr(module, saved):
+                setattr(module, target, getattr(module, saved))
+                delattr(module, saved)
+    return m
+
+
+def _checkpointed_forward(original_forward, offload_to_cpu, *args, **kwargs):
+    """Run *original_forward* through ``CheckpointFunction``."""
+    # An autograd Function's backward has to return one gradient (or None)
+    # per positional input, so keyword arguments are flattened into the
+    # positional list and their names are passed alongside.
+    kwarg_keys, flat_args = pack_kwargs(*args, **kwargs)
+    shared = {"offload": offload_to_cpu}
+    result = CheckpointFunction.apply(original_forward, shared, kwarg_keys, *flat_args)
+    if isinstance(result, torch.Tensor):
+        return result
+
+    # Non-tensor outputs come back by reference through the shared dict.
+    packed = shared["packed_non_tensor_outputs"]
+    if not packed:
+        return result
+    return unpack_non_tensors(result, packed)
+
+
+def pack_kwargs(*args, **kwargs) -> Tuple[List[str], List[Any]]:
+    """
+    Flatten positional and keyword arguments into one list, returning the
+    keyword names separately so ``unpack_kwargs`` can restore them.
+
+    Usage::
+
+        kwarg_keys, flat_args = pack_kwargs(1, 2, a=3, b=4)
+        args, kwargs = unpack_kwargs(kwarg_keys, flat_args)
+        assert args == [1, 2]
+        assert kwargs == {"a": 3, "b": 4}
+    """
+    kwarg_keys = list(kwargs)
+    flat_args = [*args, *kwargs.values()]
+    return kwarg_keys, flat_args
+
+
+def unpack_kwargs(
+    kwarg_keys: List[str], flat_args: List[Any]
+) -> Tuple[List[Any], Dict[str, Any]]:
+    # Inverse of pack_kwargs: the last len(kwarg_keys) entries are keyword values.
+    num_kwargs = len(kwarg_keys)
+    if num_kwargs == 0:
+        return flat_args, {}
+    return flat_args[:-num_kwargs], dict(zip(kwarg_keys, flat_args[-num_kwargs:]))
+
+
+def split_non_tensors(
+    mixed: Union[torch.Tensor, Tuple[Any]]
+) -> Tuple[Tuple[torch.Tensor], Dict[str, List[Any]]]:
+    """
+    Separate the tensors in *mixed* from everything else, recording the
+    original layout so ``unpack_non_tensors`` can interleave them again.
+    A lone tensor is returned as ``((tensor,), None)``.
+
+    Usage::
+
+        x = torch.Tensor([1])
+        y = torch.Tensor([2])
+        tensors, packed_non_tensors = split_non_tensors((x, y, None, 3))
+        recon = unpack_non_tensors(tensors, packed_non_tensors)
+        assert recon == (x, y, None, 3)
+    """
+    if isinstance(mixed, torch.Tensor):
+        return (mixed,), None
+    tensors, objects, is_tensor = [], [], []
+    for item in mixed:
+        tensor_like = isinstance(item, torch.Tensor)
+        is_tensor.append(tensor_like)
+        (tensors if tensor_like else objects).append(item)
+    return tuple(tensors), {"is_tensor": is_tensor, "objects": objects}
+
+
+def unpack_non_tensors(
+    tensors: Tuple[torch.Tensor],
+    packed_non_tensors: Dict[str, List[Any]],
+) -> Tuple[Any]:
+    """Re-interleave *tensors* with the objects saved by ``split_non_tensors``."""
+    if packed_non_tensors is None:
+        return tensors
+    assert isinstance(packed_non_tensors, dict)
+    is_tensor_list = packed_non_tensors["is_tensor"]
+    objects = packed_non_tensors["objects"]
+    assert len(tensors) + len(objects) == len(is_tensor_list)
+    mixed, tensor_pos, object_pos = [], 0, 0
+    for is_tensor in is_tensor_list:
+        if not is_tensor:
+            mixed.append(objects[object_pos])
+            object_pos += 1
+            continue
+        mixed.append(tensors[tensor_pos])
+        tensor_pos += 1
+    return tuple(mixed)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    """Similar to the torch version, but support non-Tensor outputs.
+
+    The caller is expected to provide a dict (*parent_ctx_dict*) that will hold
+    the non-Tensor outputs. These should be combined with the Tensor *outputs*
+    by calling ``unpack_non_tensors``.
+    """
+
+    @staticmethod
+    def forward(ctx, run_function, parent_ctx_dict, kwarg_keys, *args):
+        if torch.is_grad_enabled():  # grad may be disabled, e.g., during validation
+            checkpoint.check_backward_validity(args)
+
+        ctx.run_function = run_function
+        ctx.kwarg_keys = kwarg_keys
+        ctx.fwd_rng_state = utils.get_rng_state()  # restored before the recompute in backward
+
+        tensor_inputs, packed_non_tensor_inputs = split_non_tensors(args)
+        if parent_ctx_dict["offload"]:
+            ctx.fwd_device = tuple(x.device for x in tensor_inputs)
+            ctx.grad_requirements = tuple(x.requires_grad for x in tensor_inputs)
+            tensor_inputs = tuple(  # only the saved copies move to CPU; `args` is unchanged
+                x.to(torch.device("cpu"), non_blocking=True) for x in tensor_inputs
+            )
+
+        else:
+            ctx.fwd_device, ctx.grad_requirements = None, None
+
+        ctx.save_for_backward(*tensor_inputs)
+        ctx.packed_non_tensor_inputs = packed_non_tensor_inputs
+
+        with torch.no_grad():
+            unpacked_args, unpacked_kwargs = unpack_kwargs(kwarg_keys, args)
+            outputs = run_function(*unpacked_args, **unpacked_kwargs)
+
+        if isinstance(outputs, torch.Tensor):
+            return outputs
+        else:
+            # Autograd Functions don't like non-Tensor outputs. We can split the
+            # non-Tensor and Tensor outputs, returning the former by reference
+            # through *parent_ctx_dict* and returning the latter directly.
+            outputs, packed_non_tensor_outputs = split_non_tensors(outputs)
+            parent_ctx_dict["packed_non_tensor_outputs"] = packed_non_tensor_outputs
+            return outputs
+
+    @staticmethod
+    def backward(ctx, *args):
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError(
+                "Checkpointing is not compatible with .grad(), please use .backward() if possible"
+            )
+
+        tensor_inputs: Tuple = ctx.saved_tensors
+        tensor_inputs = checkpoint.detach_variable(tensor_inputs)
+        if ctx.fwd_device is not None:  # offload was on: move inputs back to their devices
+            tensor_inputs = [
+                t.to(ctx.fwd_device[i], non_blocking=True)
+                for i, t in enumerate(tensor_inputs)
+            ]
+            for i, need_grad in enumerate(ctx.grad_requirements):
+                tensor_inputs[i].requires_grad = need_grad
+        inputs = unpack_non_tensors(tensor_inputs, ctx.packed_non_tensor_inputs)
+
+        # Store the current states.
+        bwd_rng_state = utils.get_rng_state()
+
+        # Set the states to what it used to be before the forward pass.
+        utils.set_rng_state(ctx.fwd_rng_state)
+
+        with torch.enable_grad():  # re-run the forward, this time recording the graph
+            unpacked_args, unpacked_kwargs = unpack_kwargs(ctx.kwarg_keys, inputs)
+            outputs = ctx.run_function(*unpacked_args, **unpacked_kwargs)
+            tensor_outputs, _ = split_non_tensors(outputs)
+        # Set the states back to what it was at the start of this function.
+        utils.set_rng_state(bwd_rng_state)
+
+        # Run backward() with only Tensors that require grad
+        outputs_with_grad = []
+        args_with_grad = []
+        for i in range(len(tensor_outputs)):
+            if tensor_outputs[i].requires_grad:
+                outputs_with_grad.append(tensor_outputs[i])
+                args_with_grad.append(args[i])
+        if len(outputs_with_grad) == 0:
+            raise RuntimeError(
+                "None of the outputs have requires_grad=True, "
+                "this checkpoint() is not necessary"
+            )
+
+        torch.autograd.backward(outputs_with_grad, args_with_grad)
+
+        grads = tuple(
+            inp.grad if isinstance(inp, torch.Tensor) else None for inp in inputs
+        )
+        return (None, None, None) + grads  # Nones: run_function, parent_ctx_dict, kwarg_keys
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
new file mode 100644
index 0000000000..964af243ec
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
@@ -0,0 +1,301 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Optional
+
+import torch
+
+from fairseq.modules import (
+    ESPNETMultiHeadedAttention,
+    LayerNorm,
+    MultiheadAttention,
+    RelPositionMultiHeadedAttention,
+    RotaryPositionMultiHeadedAttention,
+)
+from fairseq.utils import get_activation_fn
+
+
+class ConvolutionModule(torch.nn.Module):
+    """Convolution block used in the conformer block"""
+
+    def __init__(
+        self,
+        embed_dim,
+        channels,
+        depthwise_kernel_size,
+        dropout,
+        activation_fn="swish",
+        bias=False,
+        export=False,
+    ):
+        """
+        Args:
+            embed_dim: Embedding dimension (input and output feature size)
+            channels: Number of channels in depthwise conv layers
+            depthwise_kernel_size: Depthwise conv layer kernel size (must be odd)
+            dropout: dropout value, applied after the second pointwise conv
+            activation_fn: Activation function to use after depthwise convolution kernel
+            bias: If bias should be added to conv layers
+            export: Forwarded to LayerNorm as ``export``
+        """
+        super(ConvolutionModule, self).__init__()
+        assert (
+            depthwise_kernel_size - 1
+        ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding"
+        self.layer_norm = LayerNorm(embed_dim, export=export)
+        self.pointwise_conv1 = torch.nn.Conv1d(
+            embed_dim,
+            2 * channels,  # doubled so that the GLU below halves it back to ``channels``
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.glu = torch.nn.GLU(dim=1)  # gate over the channel axis
+        self.depthwise_conv = torch.nn.Conv1d(
+            channels,
+            channels,
+            depthwise_kernel_size,
+            stride=1,
+            padding=(depthwise_kernel_size - 1) // 2,  # keeps the time length unchanged
+            groups=channels,  # one filter per channel, i.e. depthwise
+            bias=bias,
+        )
+        self.batch_norm = torch.nn.BatchNorm1d(channels)
+        self.activation = get_activation_fn(activation_fn)(channels)  # NOTE(review): the lookup result is *called* with ``channels``, so it must be a factory - confirm
+        self.pointwise_conv2 = torch.nn.Conv1d(
+            channels,
+            embed_dim,  # project back to the embedding size
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.dropout = torch.nn.Dropout(dropout)
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input of shape B X T X C
+        Returns:
+          Tensor of shape B X T X C
+        """
+        x = self.layer_norm(x)
+        # exchange the temporal dimension and the feature dimension: B X C X T
+        x = x.transpose(1, 2)
+
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+        x = self.glu(x)  # (batch, channels, time)
+
+        # 1D Depthwise Conv, then batch norm and activation
+        x = self.depthwise_conv(x)
+        x = self.batch_norm(x)
+        x = self.activation(x)
+
+        x = self.pointwise_conv2(x)  # (batch, embed_dim, time)
+        x = self.dropout(x)
+        return x.transpose(1, 2)  # back to B X T X C
+
+
+class FeedForwardModule(torch.nn.Module):
+    """Positionwise feed forward layer used in conformer"""
+
+    def __init__(
+        self,
+        input_feat,
+        hidden_units,
+        dropout1,
+        dropout2,
+        activation_fn="swish",
+        bias=True,
+    ):
+        """
+        Args:
+            input_feat: Input feature dimension (also the output dimension)
+            hidden_units: Hidden unit dimension
+            dropout1: dropout value applied after the activation
+            dropout2: dropout value applied after the second linear layer
+            activation_fn: Name of activation function
+            bias: If linear layers should have bias
+        """
+
+        super(FeedForwardModule, self).__init__()
+        self.layer_norm = LayerNorm(input_feat)
+        self.w_1 = torch.nn.Linear(input_feat, hidden_units, bias=bias)
+        self.w_2 = torch.nn.Linear(hidden_units, input_feat, bias=bias)
+        self.dropout1 = torch.nn.Dropout(dropout1)
+        self.dropout2 = torch.nn.Dropout(dropout2)
+        self.activation = get_activation_fn(activation_fn)(hidden_units)  # NOTE(review): lookup result is *called* with ``hidden_units`` - must be a factory, confirm
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input Tensor of shape  T X B X C
+        Returns:
+            Tensor of shape T X B X C
+        """
+        x = self.layer_norm(x)  # normalisation comes first; no residual is added here
+        x = self.w_1(x)
+        x = self.activation(x)
+        x = self.dropout1(x)
+        x = self.w_2(x)
+        return self.dropout2(x)
+
+
+class ConformerEncoderLayer(torch.nn.Module):
+    """Conformer block based on https://arxiv.org/abs/2005.08100. We currently don't support relative positional encoding in MHA"""
+
+    def __init__(
+        self,
+        embed_dim,
+        ffn_embed_dim,
+        attention_heads,
+        dropout,
+        use_fp16,
+        depthwise_conv_kernel_size=31,
+        activation_fn="swish",
+        attn_type=None,
+        pos_enc_type="abs",
+    ):
+        """
+        Args:
+            embed_dim: Input embedding dimension
+            ffn_embed_dim: FFN layer dimension
+            attention_heads: Number of attention heads in MHA
+            dropout: dropout value
+            depthwise_conv_kernel_size: Size of kernel in depthwise conv layer in convolution module
+            activation_fn: Activation function name to use in convulation block and feed forward block
+            attn_type: MHA implementation from ESPNET vs fairseq
+            pos_enc_type: Positional encoding type - abs, rope, rel_pos
+        """
+        self.pos_enc_type = pos_enc_type
+        super(ConformerEncoderLayer, self).__init__()
+
+        self.ffn1 = FeedForwardModule(
+            embed_dim,
+            ffn_embed_dim,
+            dropout,
+            dropout,
+        )
+
+        self.self_attn_layer_norm = LayerNorm(embed_dim, export=False)
+        self.self_attn_dropout = torch.nn.Dropout(dropout)
+        if attn_type == "espnet":
+            if self.pos_enc_type == "rel_pos":
+                self.self_attn = RelPositionMultiHeadedAttention(
+                    embed_dim,
+                    attention_heads,
+                    dropout=dropout,
+                )
+            elif self.pos_enc_type == "rope":
+                self.self_attn = RotaryPositionMultiHeadedAttention(
+                    embed_dim, attention_heads, dropout=dropout, precision=use_fp16
+                )
+            elif self.pos_enc_type == "abs":
+                self.self_attn = ESPNETMultiHeadedAttention(
+                    embed_dim,
+                    attention_heads,
+                    dropout=dropout,
+                )
+            else:
+                raise Exception(f"Unsupported attention type {self.pos_enc_type}")
+        else:
+            # Default to fairseq MHA
+            self.self_attn = MultiheadAttention(
+                embed_dim,
+                attention_heads,
+                dropout=dropout,
+            )
+
+        self.conv_module = ConvolutionModule(
+            embed_dim=embed_dim,
+            channels=embed_dim,
+            depthwise_kernel_size=depthwise_conv_kernel_size,
+            dropout=dropout,
+            activation_fn=activation_fn,
+        )
+
+        self.ffn2 = FeedForwardModule(
+            embed_dim,
+            ffn_embed_dim,
+            dropout,
+            dropout,
+            activation_fn=activation_fn,
+        )
+        self.final_layer_norm = LayerNorm(embed_dim, export=False)
+
+    def forward(
+        self,
+        x,
+        encoder_padding_mask: Optional[torch.Tensor],
+        position_emb: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            x: Tensor of shape T X B X C
+            encoder_padding_mask: Optional mask tensor
+            positions:
+        Returns:
+            Tensor of shape T X B X C
+        """
+        residual = x
+        x = self.ffn1(x)
+        x = x * 0.5 + residual
+        residual = x
+        x = self.self_attn_layer_norm(x)
+        if self.pos_enc_type == "rel_pos":
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=encoder_padding_mask,
+                pos_emb=position_emb,
+                need_weights=False,
+            )
+        else:
+            x, attn = self.self_attn(
+                query=x,
+                key=x,
+                value=x,
+                key_padding_mask=encoder_padding_mask,
+                need_weights=False,
+            )
+        x = self.self_attn_dropout(x)
+        x = x + residual
+
+        residual = x
+        # TBC to BTC
+        x = x.transpose(0, 1)
+        x = self.conv_module(x)
+        # BTC to TBC
+        x = x.transpose(0, 1)
+        x = residual + x
+
+        residual = x
+        x = self.ffn2(x)
+
+        layer_result = x
+
+        x = x * 0.5 + residual
+
+        x = self.final_layer_norm(x)
+        return x, (attn, layer_result)
+
+
+class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer):
+    """Encoder layer for Wav2vec2 encoder; same computation as the parent, different forward() arguments"""
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: torch.Tensor = None,  # accepted but not used by this method
+        self_attn_padding_mask: torch.Tensor = None,  # becomes the parent's encoder_padding_mask
+        need_weights: bool = False,  # accepted but not used by this method
+        att_args=None,  # accepted but not used by this method
+        position_emb=None,
+    ):
+        return super().forward(x, self_attn_padding_mask, position_emb)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
new file mode 100644
index 0000000000..65e17ec94f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch import nn
+from torch.nn.modules.utils import _single
+from torch import Tensor
+
+
+class ConvTBC(torch.nn.Module):
+    """1D convolution over an input of shape (time x batch x channel)
+
+    The implementation uses gemm to perform the convolution. This implementation
+    is faster than cuDNN for small kernel sizes.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
+        super(ConvTBC, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _single(kernel_size)
+        self.padding = _single(padding)
+
+        self.weight = torch.nn.Parameter(
+            torch.Tensor(self.kernel_size[0], in_channels, out_channels)
+        )
+        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.xavier_normal_(self.weight)
+        nn.init.zeros_(self.bias)
+
+    def conv_tbc(self, input: Tensor):
+        return torch.conv_tbc(
+            input.contiguous(), self.weight, self.bias, self.padding[0]
+        )
+
+    def forward(self, input: Tensor):
+        return self.conv_tbc(input)
+
+    def __repr__(self):
+        s = (
+            "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}"
+            ", padding={padding}"
+        )
+        if self.bias is None:
+            s += ", bias=False"
+        s += ")"
+        return s.format(name=self.__class__.__name__, **self.__dict__)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
new file mode 100644
index 0000000000..286c00eecc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn.functional as F
+
+logger = logging.getLogger(__name__)
+
+
+def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"):
+    lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    return F.nll_loss(
+        lprobs,
+        target,
+        ignore_index=ignore_index,
+        reduction=reduction,
+    )
+
+
+try:
+    import xentropy_cuda  # not referenced below; presumably probes for the compiled extension - confirm
+    from apex.contrib import xentropy
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
+        if logits.device == torch.device("cpu"):
+            return _cross_entropy_pytorch(logits, target, ignore_index, reduction)  # CPU tensors use the plain PyTorch path
+        else:
+            if not getattr(cross_entropy, "_has_logged_once", False):
+                logger.info("using fused cross entropy")
+                cross_entropy._has_logged_once = True  # flag stored on the function so the message is logged once
+
+            half_to_float = logits.dtype == torch.half
+            losses = xentropy.SoftmaxCrossEntropyLoss.apply(
+                logits,
+                target,
+                0.0,  # NOTE(review): presumably the label-smoothing amount - confirm against apex
+                ignore_index,
+                half_to_float,
+            )
+            if reduction == "sum":
+                return losses.sum()
+            elif reduction == "mean":
+                if ignore_index >= 0:
+                    return losses.sum() / target.ne(ignore_index).sum()  # average over targets that differ from ignore_index
+                else:
+                    return losses.mean()  # negative ignore_index: plain mean over every position
+            elif reduction == "none":
+                return losses
+            else:
+                raise NotImplementedError
+
+except ImportError:
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
+        return _cross_entropy_pytorch(logits, target, ignore_index, reduction)  # fallback when either import above fails
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cuda_utils.cu b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cuda_utils.cu
new file mode 100644
index 0000000000..924f852758
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cuda_utils.cu
@@ -0,0 +1,202 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+template <typename U, typename V>
+constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
+  return (a + b - 1) / b; // ceil(a / b) for positive integer operands
+}
+
+template <int FS, int SB, int padding_l, typename scalar_t>
+__inline__ __device__ void zeroSharedMem(scalar_t* data) {
+  /*
+    Given an array of length FS + SB, zero out the first padding_l and last
+    (FS - padding_l) values in the array
+  */
+
+  int tid = threadIdx.x; // one element per thread and pass; assumes blockDim.x == SB - TODO confirm
+
+  if (FS < SB) {
+    // zero all if we have enough threads in a block to do all of them
+    if (tid < padding_l || tid > SB - FS + padding_l - 1) { // NOTE(review): upper range is data[SB-FS+padding_l ...], not data[SB+padding_l ...] as in the else branch - confirm
+      data[tid] = scalar_t(0.0);
+    }
+  } else {
+    // otherwise zero out one block at a time
+    const int numIterations = divUp<int, int>(FS, SB);
+    for (int i = 0; i < numIterations; i++) {
+      int offset = i * SB;
+      if (tid + offset < padding_l) {
+        data[tid + offset] = scalar_t(0.0); // left overhang: data[0 .. padding_l)
+      } else if (tid + offset < FS) {
+        data[SB + tid + offset] = scalar_t(0.0); // right overhang: data[SB + padding_l .. SB + FS)
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__inline__ __device__ scalar_t warpReduce(scalar_t data) {
+  /*
+    Reduce a value within each warp. After processing, every lane in the warp
+    will contain the sum of all original values in that warp.
+
+    data - this lane's value to reduce (passed by value, not a pointer)
+  */
+  data += __shfl_xor_sync(SHFL_MASK, data, 16); // SHFL_MASK is not defined in this file
+  data += __shfl_xor_sync(SHFL_MASK, data, 8); // each step adds the value held by the lane whose id differs in one bit
+  data += __shfl_xor_sync(SHFL_MASK, data, 4);
+  data += __shfl_xor_sync(SHFL_MASK, data, 2);
+  data += __shfl_xor_sync(SHFL_MASK, data, 1);
+  return data;
+}
+
+template <typename scalar_t>
+__inline__ __device__ scalar_t blockReduce(scalar_t data) {
+  /*
+     Reduce one value per thread across the whole block. The threads of
+     warp 0 return the block-wide sum; all other threads return 0.
+
+     data - this thread's value to reduce (passed by value, not a pointer)
+  */
+
+  static __shared__ scalar_t warpSum[32]; // one partial sum per warp
+  const int tid = threadIdx.x;
+  int wid = tid / 32; // warp index within the block
+  int lane = tid % 32; // lane index within the warp
+
+  __syncthreads(); // presumably protects warpSum against a previous call still being read - confirm
+
+  // reduce each warp then write to shared memory
+  scalar_t sum = warpReduce(data);
+  if (lane == 0) {
+    warpSum[wid] = sum;
+  }
+
+  __syncthreads();
+
+  scalar_t v;
+  // perform final sum of partial warp sums
+  if (tid < blockDim.x / 32) { // NOTE(review): integer division - a trailing partial warp would be left out; confirm blockDim.x % 32 == 0
+    v = warpSum[lane];
+  } else {
+    v = scalar_t(0.0);
+  }
+
+  if (wid == 0) {
+    v = warpReduce(v); // only warp 0 combines the per-warp partial sums
+  }
+  __syncthreads();
+
+  return v;
+}
+
+void checkCudaStatus(cudaError_t status, int lineNumber = -1) { // on any non-success status: print the error and terminate
+  if (status != cudaSuccess) {
+    std::cout << cudaGetErrorString(status) << " at line " << lineNumber // this file has no #include; <iostream> must come from the includer
+              << std::endl;
+    std::cout << "Exiting" << std::endl;
+    exit(1); // terminates the whole process with status 1
+  }
+}
+
+template <int FS, int SB, int padding_l, typename scalar_t>
+__device__ void load_input_to_shared(
+    const scalar_t* input, // global memory
+    int inputOffset,
+    int sequenceLength,
+    int iteration,
+    int numIterations,
+    bool no_prev,
+    scalar_t* output /* shared memory */) {
+  /*
+    Load SB input elements into shared memory, preceded by a left overhang
+    of padding_l elements and followed by a right overhang of
+    (FS - padding_l) elements. Unless no_prev is set, the left overhang is
+    copied from the data already in `output` instead of from global memory.
+
+    input - pointer to start of channel sequence
+    inputOffset - how far in the sequence to start loading
+    sequenceLength - total length of sequence
+    iteration - which block of sequence we are loading
+    numIterations - total number of blocks to load
+    no_prev - when true, read the left overhang from `input` because the
+              previous block is not available in `output`
+    output - shared memory to write input to
+  */
+
+  const int tid = threadIdx.x;
+
+  // Load the left "overhang" of input (not written here when iteration == 0)
+  if (iteration > 0) {
+    if (padding_l < SB) {
+      // load all at once
+      if (tid < padding_l) {
+        output[tid] =
+            (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB];
+      }
+    } else {
+      // load in chunks of size SB
+      int numIterations = divUp<int, int>(padding_l, SB); // shadows the numIterations parameter inside this scope
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < padding_l) {
+          output[tid + offset] = (no_prev)
+              ? input[inputOffset - padding_l + tid + offset]
+              : output[tid + offset + SB];
+        }
+      }
+    }
+  }
+
+  // Load the right "overhang" of input
+  if (iteration < (numIterations - 1)) {
+    const int elementsLeft = sequenceLength - (iteration + 1) * SB; // elements remaining after this block
+
+    if ((FS - padding_l) < SB) {
+      // load all at once
+      if (tid < (FS - padding_l)) {
+        output[padding_l + SB + tid] = (tid < elementsLeft)
+            ? input[inputOffset + SB + tid]
+            : scalar_t(0.0);
+      }
+    } else {
+      // load in chunks of size SB
+      int numIterations = divUp<int, int>(FS - padding_l, SB); // shadows the numIterations parameter inside this scope
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < (FS - padding_l)) {
+          output[padding_l + SB + tid + offset] =
+              ((tid + offset) < elementsLeft)
+              ? input[inputOffset + SB + tid + offset]
+              : scalar_t(0.0);
+        }
+      }
+    }
+  }
+
+  // On the last block there is no input to the right: zero the right "overhang"
+  if (iteration == (numIterations - 1)) {
+    if ((FS - padding_l) < SB) {
+      // clear out all at once
+      if (tid < (FS - padding_l)) {
+        output[padding_l + SB + tid] = scalar_t(0.0);
+      }
+    } else {
+      // clear in chunks of size SB
+      int numIterations = divUp<int, int>(FS - padding_l, SB); // shadows the numIterations parameter inside this scope
+      for (int i = 0; i < numIterations; i++) {
+        int offset = i * SB;
+        if ((tid + offset) < (FS - padding_l)) {
+          output[padding_l + SB + tid + offset] = scalar_t(0.0);
+        }
+      }
+    }
+  }
+  output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) // main block: every thread stores one element, zero past the end
+      ? input[inputOffset + tid]
+      : scalar_t(0.0);
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
new file mode 100644
index 0000000000..5e42942a9f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
@@ -0,0 +1,317 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.scalar_bias import scalar_bias
+
+
+class SingleHeadAttention(nn.Module):
+    """
+    Single-head attention that supports Gating and Downsampling
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        embed_dim,
+        head_dim,
+        head_index,
+        dropout=0.0,
+        bias=True,
+        project_input=True,
+        gated=False,
+        downsample=False,
+        num_heads=1,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.head_index = head_index
+        self.head_dim = head_dim
+        self.project_input = project_input
+        self.gated = gated
+        self.downsample = downsample
+        self.num_heads = num_heads
+        self.projection = None  # not referenced anywhere else in this file
+
+        k_layers = []
+        v_layers = []
+        if self.downsample:
+            # keys and values keep every (head_index + 1)-th time step
+            k_layers.append(Downsample(self.head_index))
+            v_layers.append(Downsample(self.head_index))
+            out_proj_size = self.head_dim
+        else:
+            out_proj_size = self.head_dim * self.num_heads  # all heads share one projection
+        if self.gated:
+            k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
+            self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias)
+            v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
+        else:
+            k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
+            self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias)
+            v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
+
+        self.in_proj_k = nn.Sequential(*k_layers)
+        self.in_proj_v = nn.Sequential(*v_layers)
+
+        if self.downsample:
+            self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias)
+        else:
+            self.out_proj = Linear(out_proj_size, out_channels, bias=bias)
+
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim), applied to the queries
+
+    def forward(
+        self,
+        query,
+        key,
+        value,
+        mask_future_timesteps=False,
+        key_padding_mask=None,
+        use_scalar_bias=False,
+    ):
+        """Input shape: Time x Batch x Channel
+        Self-attention can be implemented by passing in the same arguments for
+        query, key and value. Future timesteps can be masked with the
+        `mask_future_timesteps` argument. Padding elements can be excluded from
+        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
+        batch x src_len, where padding elements are indicated by 1s.
+        """
+        src_len, bsz, out_channels = key.size()
+        tgt_len = query.size(0)
+        assert list(query.size()) == [tgt_len, bsz, out_channels]
+        assert key.size() == value.size()
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.downsample:
+            size = bsz
+        else:
+            size = bsz * self.num_heads  # heads are folded into the batch axis
+
+        k = key
+        v = value
+        q = query
+        if self.project_input:
+            q = self.in_proj_q(q)
+            k = self.in_proj_k(k)
+            v = self.in_proj_v(v)
+            src_len = k.size()[0]  # may be shorter than the input after downsampling
+        q *= self.scaling  # in-place: with project_input=False this scales the caller's query tensor
+
+        if not self.downsample:
+            q = q.view(tgt_len, size, self.head_dim)
+            k = k.view(src_len, size, self.head_dim)
+            v = v.view(src_len, size, self.head_dim)
+
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+
+        attn_weights = torch.bmm(q, k.transpose(1, 2))  # (size, tgt_len, src_len)
+        if mask_future_timesteps:
+            assert (
+                query.size() == key.size()
+            ), "mask_future_timesteps only applies to self-attention"
+            # keep only scores strictly below the diagonal (earlier time steps) ...
+            attn_weights *= torch.tril(
+                attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
+                diagonal=-1,
+            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
+            # ... and set the diagonal and everything above it to -inf
+            attn_weights += torch.triu(
+                attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
+                diagonal=0,
+            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
+        tgt_size = tgt_len  # NOTE(review): tgt_size is updated below but never read in this method
+        if use_scalar_bias:
+            attn_weights = scalar_bias(attn_weights, 2)
+            v = scalar_bias(v, 1)
+            tgt_size += 1
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            if key_padding_mask.max() > 0:
+                if self.downsample:
+                    attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
+                else:
+                    # NOTE(review): size is already bsz * num_heads, so this view only fits when num_heads == 1 - confirm
+                    attn_weights = attn_weights.view(
+                        size, self.num_heads, tgt_len, src_len
+                    )
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2),
+                    -math.inf,
+                )
+                attn_weights = attn_weights.view(size, tgt_len, src_len)
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = self.dropout_module(attn_weights)
+
+        attn = torch.bmm(attn_weights, v)
+        if self.downsample:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)
+
+        attn = self.out_proj(attn)
+
+        return attn, attn_weights  # weights are returned after softmax and dropout
+
+
+class DownsampledMultiHeadAttention(nn.ModuleList):
+    """
+    Multi-headed attention with Gating and Downsampling
+    """
+
+    def __init__(
+        self,
+        out_channels,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        bias=True,
+        project_input=True,
+        gated=False,
+        downsample=False,
+    ):
+        # plain Python values are stored before super().__init__() runs in either branch below
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.downsample = downsample
+        self.gated = gated
+        self.project_input = project_input
+        assert self.head_dim * num_heads == embed_dim  # embed_dim must be divisible by num_heads
+
+        if self.downsample:
+            attention_heads = []
+            for index in range(self.num_heads):
+                attention_heads.append(
+                    SingleHeadAttention(
+                        out_channels,
+                        self.embed_dim,
+                        self.head_dim,
+                        index,  # head i keeps every (i + 1)-th key/value time step
+                        dropout,
+                        bias,
+                        self.project_input,
+                        self.gated,
+                        self.downsample,
+                        self.num_heads,
+                    )
+                )
+            super().__init__(modules=attention_heads)
+            self.out_proj = Linear(embed_dim, out_channels, bias=bias)  # applied to the concatenated heads
+        else:
+            # either we have a list of attention heads, or just one attention head
+            # if not being downsampled, we can do the heads with one linear layer instead of separate ones
+            super().__init__()
+            self.attention_module = SingleHeadAttention(
+                out_channels,
+                self.embed_dim,
+                self.head_dim,
+                1,  # head_index; SingleHeadAttention only reads it when downsample is true
+                dropout,
+                bias,
+                self.project_input,
+                self.gated,
+                self.downsample,
+                self.num_heads,
+            )
+
+    def forward(
+        self,
+        query,
+        key,
+        value,
+        mask_future_timesteps=False,
+        key_padding_mask=None,
+        use_scalar_bias=False,
+    ):
+        src_len, bsz, embed_dim = key.size()
+        tgt_len = query.size(0)
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+        assert key.size() == value.size()
+
+        tgt_size = tgt_len
+        if use_scalar_bias:
+            tgt_size += 1  # only used for the view() in the non-downsampled branch
+
+        attn = []
+        attn_weights = []
+        if self.downsample:
+            for attention_head_number in range(self.num_heads):
+                # call the forward of each attention head
+                _attn, _attn_weight = self[attention_head_number](
+                    query,
+                    key,
+                    value,
+                    mask_future_timesteps,
+                    key_padding_mask,
+                    use_scalar_bias,
+                )
+                attn.append(_attn)
+                attn_weights.append(_attn_weight)
+            full_attn = torch.cat(attn, dim=2)  # concatenate the heads along the channel axis
+            full_attn = self.out_proj(full_attn)
+            return full_attn, attn_weights[0].clone()  # only the first head's weights are returned
+        else:
+            _attn, _attn_weight = self.attention_module(
+                query,
+                key,
+                value,
+                mask_future_timesteps,
+                key_padding_mask,
+                use_scalar_bias,
+            )
+            attn.append(_attn)
+            attn_weights.append(_attn_weight)
+            full_attn = torch.cat(attn, dim=2)  # single-element list
+            full_attn_weights = torch.cat(attn_weights)
+            full_attn_weights = full_attn_weights.view(  # NOTE(review): with use_scalar_bias the extra entry is presumably on the key axis, not tgt - confirm
+                bsz, self.num_heads, tgt_size, src_len
+            )
+            full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads  # average over heads
+            return full_attn, full_attn_weights
+
+
+class Downsample(nn.Module):
+    """
+    Selects every nth element, where n is the index
+    """
+
+    def __init__(self, index):
+        super().__init__()
+        self.index = index
+
+    def forward(self, x):
+        return x[:: self.index + 1]
+
+
+def Linear(in_features, out_features, dropout=0.0, bias=True):
+    """Weight-normalized Linear layer (input: B x T x C)"""
+    m = nn.Linear(in_features, out_features, bias=bias)
+    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
+    m.bias.data.zero_()
+    return nn.utils.weight_norm(m)
+
+
+def GatedLinear(in_features, out_features, dropout=0.0, bias=True):
+    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
+    return nn.Sequential(
+        Linear(in_features, out_features * 4, dropout, bias),
+        nn.GLU(),  # gates over the last dim and halves it: 4x -> 2x out_features
+        Linear(out_features * 2, out_features * 2, dropout, bias),
+        nn.GLU(),  # halves again: 2x -> 1x out_features
+        Linear(out_features, out_features, dropout, bias),
+    )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
new file mode 100644
index 0000000000..0ff02cd62a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
@@ -0,0 +1,526 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import (
+    FairseqIncrementalState,
+    with_incremental_state,
+)
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from torch import Tensor
+
+from .unfold import unfold1d
+
+
+def DynamicConv(
+    input_size,
+    kernel_size=1,
+    padding_l=None,
+    num_heads=1,
+    weight_dropout=0.0,
+    weight_softmax=False,
+    renorm_padding=False,
+    bias=False,
+    conv_bias=False,
+    query_size=None,
+    in_proj=False,  # NOTE(review): accepted but not passed to either constructor below
+):  # factory: DynamicconvLayer if CUDA is available and it imports, else DynamicConv1dTBC
+    if torch.cuda.is_available():
+        try:
+            from fairseq.modules.dynamicconv_layer import DynamicconvLayer  # imported lazily; failure is handled below
+
+            return DynamicconvLayer(
+                input_size,
+                kernel_size=kernel_size,
+                padding_l=padding_l,
+                num_heads=num_heads,
+                weight_dropout=weight_dropout,
+                weight_softmax=weight_softmax,
+                renorm_padding=renorm_padding,
+                bias=bias,
+                conv_bias=conv_bias,
+                query_size=query_size,
+            )
+        except ImportError as e:
+            print(e)  # report the import failure, then fall through to the fallback below
+    return DynamicConv1dTBC(
+        input_size,
+        kernel_size=kernel_size,
+        padding_l=padding_l,
+        num_heads=num_heads,
+        weight_dropout=weight_dropout,
+        weight_softmax=weight_softmax,
+        renorm_padding=renorm_padding,
+        bias=bias,
+        conv_bias=conv_bias,
+        query_size=query_size,
+    )
+
+
+def Linear(in_features, out_features, bias=True):  # nn.Linear with Xavier-uniform weight and zero bias
+    layer = nn.Linear(in_features, out_features, bias)
+    nn.init.xavier_uniform_(layer.weight)
+    if bias:
+        nn.init.zeros_(layer.bias)
+    return layer
+
+
+@with_incremental_state
+class DynamicConv1dTBC(nn.Module):
+    """Dynamic lightweight convolution taking T x B x C inputs
+    Args:
+        input_size: # of channels of the input
+        kernel_size: convolution channels
+        padding_l: padding to the left when using "same" padding
+        num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size)
+        weight_dropout: the drop rate of the DropConnect to drop the weight
+        weight_softmax: normalize the weight with softmax before the convolution
+        renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1)
+        bias: use bias
+        conv_bias: bias of the convolution
+        query_size: specified when feeding a different input as the query
+        in_proj: project the input and generate the filter together
+
+    Shape:
+        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
+        Output: TxBxC, i.e. (timesteps, batch_size, input_size)
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            `(num_heads, 1, kernel_size)`
+        bias:   the learnable bias of the module of shape `(input_size)`
+    """
+
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding_l=None,
+        num_heads=1,
+        weight_dropout=0.0,
+        weight_softmax=False,
+        renorm_padding=False,
+        bias=False,
+        conv_bias=False,
+        query_size=None,
+        in_proj=False,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.query_size = input_size if query_size is None else query_size  # defaults to input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+        self.weight_softmax = weight_softmax
+        self.renorm_padding = renorm_padding
+
+        if in_proj:  # one projection emits input_size channels plus num_heads * kernel_size filter taps
+            self.weight_linear = Linear(
+                self.input_size, self.input_size + num_heads * kernel_size * 1
+            )
+        else:  # project the query to num_heads * kernel_size filter taps only
+            self.weight_linear = Linear(
+                self.query_size, num_heads * kernel_size * 1, bias=bias
+            )
+        if conv_bias:
+            self.conv_bias = nn.Parameter(torch.Tensor(input_size))  # uninitialized here; zeroed in reset_parameters()
+        else:
+            self.conv_bias = None
+        self.reset_parameters()
+
+    @property
+    def in_proj(self):
+        # True when weight_linear was sized to emit the projected input plus the filter taps
+        filter_width = self.num_heads * self.kernel_size
+        joint_width = self.input_size + filter_width
+        return self.weight_linear.out_features == joint_width
+
+    def reset_parameters(self):
+        self.weight_linear.reset_parameters()  # NOTE(review): likely replaces the init applied by Linear() -- confirm
+        if self.conv_bias is not None:
+            nn.init.constant_(self.conv_bias, 0.0)  # convolution bias starts at zero
+
+    def forward(self, x, incremental_state=None, query=None, unfold=None):
+        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
+        args:
+            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
+            incremental_state: A dict to keep the state; when given, the unfolded path is always used
+            unfold: unfold the input or not. If not, we use the matrix trick instead (None: unfold iff T > 512)
+            query: use the specified query to predict the conv filters (must be None when in_proj is set)
+        """
+        unfold = (
+            x.size(0) > 512 if unfold is None else unfold
+        )  # use unfold mode as default for long sequence to save memory
+        unfold = unfold or (incremental_state is not None)  # _forward_expanded ignores incremental state
+        assert query is None or not self.in_proj
+
+        if query is None:
+            query = x  # filters are predicted from the input itself
+        if unfold:
+            output = self._forward_unfolded(x, incremental_state, query)
+        else:
+            output = self._forward_expanded(x, incremental_state, query)
+
+        if self.conv_bias is not None:
+            output = output + self.conv_bias.view(1, 1, -1)  # broadcast the per-channel bias over T and B
+        return output
+
+    def _forward_unfolded(self, x, incremental_state, query):
+        """The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right."""
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H  # channels per head
+        assert R * H == C == self.input_size
+
+        if self.in_proj:
+            proj = self.weight_linear(x)
+            x = proj.narrow(2, 0, self.input_size).contiguous()  # first input_size channels: projected input
+            weight = (  # remaining H * K channels: filter taps, one row per (t, b, head)
+                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
+            )
+        else:
+            weight = self.weight_linear(query).view(T * B * H, -1)
+
+        # without incremental state, renorm_padding is only implemented in _forward_expanded
+        assert not self.renorm_padding or incremental_state is not None
+
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()  # no history yet: start from an empty tensor
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)  # append the new step on a trailing axis
+            if self.kernel_size > 1:  # keep only the most recent K - 1 entries for the next call
+                self._set_input_buffer(
+                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
+                )
+            x_unfold = x_unfold.view(T * B * H, R, -1)
+        else:
+            padding_l = self.padding_l
+            if K > T and padding_l == K - 1:  # kernel longer than the sequence: trim taps and padding
+                weight = weight.narrow(1, K - T, T)
+                K, padding_l = T, T - 1
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, K, padding_l, 0)
+            x_unfold = x_unfold.view(T * B * H, R, K)
+
+        if self.weight_softmax and not self.renorm_padding:
+            weight = F.softmax(weight, dim=1)  # normalize over the taps before any trimming below
+        weight = weight.narrow(1, 0, K)
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2) :]  # keep as many trailing taps as buffered steps
+            K = weight.size(1)
+
+        if self.weight_softmax and self.renorm_padding:
+            weight = F.softmax(weight, dim=1)  # normalize after trimming, over the remaining taps only
+
+        weight = self.weight_dropout_module(weight, inplace=False)
+
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_stat, query):  # incremental_stat is not used in this method
+        """Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        """
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H  # channels per head
+        assert R * H == C == self.input_size
+        if self.in_proj:
+            proj = self.weight_linear(x)
+            x = proj.narrow(2, 0, self.input_size).contiguous()  # first input_size channels: projected input
+            weight = (  # remaining H * K channels: filter taps, one row per (t, b, head)
+                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
+            )
+        else:
+            weight = self.weight_linear(query).view(T * B * H, -1)
+
+        if not self.renorm_padding:
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=1)
+            weight = self.weight_dropout_module(weight, inplace=False)
+        weight = weight.narrow(1, 0, K).contiguous()
+        weight = weight.view(T, B * H, K).transpose(0, 1)  # B*H x T x K
+
+        x = x.view(T, B * H, R).transpose(0, 1)  # B*H x T x R
+        if self.weight_softmax and self.renorm_padding:
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))  # -inf outside the band
+            weight_expanded.as_strided(  # row stride T + K starts each row's taps one column later
+                (B * H, T, K), (T * (T + K - 1), T + K, 1)
+            ).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
+            # normalize the weight over valid positions like self-attention
+            weight_expanded = F.softmax(weight_expanded, dim=2)
+            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
+        else:
+            P = self.padding_l
+            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
+            if K > T and P == K - 1:
+                weight = weight.narrow(2, K - T, T)
+                K, P = T, T - 1
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
+            weight_expanded.as_strided(  # row stride T + K starts each row's taps one column later
+                (B * H, T, K), (T * (T + K - 1), T + K, 1)
+            ).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
+        output = torch.bmm(weight_expanded, x)  # B*H x T x R
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:  # nothing to reorder until a buffer has been stored
+            input_buffer = input_buffer.index_select(1, new_order)  # reorder along dim 1 (batch axis of T x B x C x window)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):  # fetch this module's stored "input_buffer" entry
+        return utils.get_incremental_state(self, incremental_state, "input_buffer")
+
+    def _set_input_buffer(self, incremental_state, new_buffer):  # store new_buffer as this module's "input_buffer"
+        return utils.set_incremental_state(
+            self, incremental_state, "input_buffer", new_buffer
+        )
+
+    def extra_repr(self):  # one-line configuration summary string
+        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format(
+            self.input_size,
+            self.kernel_size,
+            self.padding_l,
+            self.num_heads,
+            self.weight_softmax,
+            self.conv_bias is not None,  # reported as a flag, not the tensor
+            self.renorm_padding,
+            self.in_proj,
+        )
+
+        if self.query_size != self.input_size:  # only shown when it differs from input_size
+            s += ", query_size={}".format(self.query_size)
+        if self.weight_dropout_module.p > 0.0:  # only shown when dropout is enabled
+            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
+        return s
+
+
+class DynamicConv_scripatable(nn.Module, FairseqIncrementalState):
+    """Dynamic lightweight convolution taking T x B x C inputs
+    Args:
+        input_size: # of channels of the input
+        kernel_size: convolution channels
+        padding_l: padding to the left when using "same" padding
+        num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size)
+        weight_dropout: the drop rate of the DropConnect to drop the weight
+        weight_softmax: normalize the weight with softmax before the convolution
+        renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1)
+        bias: use bias
+        conv_bias: bias of the convolution
+        query_size: specified when feeding a different input as the query
+        in_proj: project the input and generate the filter together
+
+    Shape:
+        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
+        Output: TxBxC, i.e. (timesteps, batch_size, input_size)
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            `(num_heads, 1, kernel_size)`
+        bias:   the learnable bias of the module of shape `(input_size)`
+    """
+
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding_l=None,
+        num_heads=1,
+        weight_dropout=0.0,
+        weight_softmax=False,
+        renorm_padding=False,
+        bias=False,
+        conv_bias=False,
+        query_size=None,
+        in_proj=False,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.query_size = input_size if query_size is None else query_size  # defaults to input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+        self.weight_softmax = weight_softmax
+        self.renorm_padding = renorm_padding
+
+        if in_proj:  # one projection emits input_size channels plus num_heads * kernel_size filter taps
+            self.weight_linear = Linear(
+                self.input_size, self.input_size + num_heads * kernel_size * 1
+            )
+        else:  # project the query to num_heads * kernel_size filter taps only
+            self.weight_linear = Linear(
+                self.query_size, num_heads * kernel_size * 1, bias=bias
+            )
+        self.in_proj = (  # plain attribute here, derived from the projection width
+            self.weight_linear.out_features
+            == self.input_size + self.num_heads * self.kernel_size
+        )
+        self.has_conv_bias = conv_bias
+        self.conv_bias = nn.Parameter(torch.zeros(input_size).view(1, 1, -1))  # zeros: never left as uninitialized memory
+        self.init_incremental_state()
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        self.weight_linear.reset_parameters()  # NOTE(review): likely replaces the init applied by Linear() -- confirm
+        if self.has_conv_bias:  # conv_bias is only re-zeroed when the bias is enabled
+            nn.init.constant_(self.conv_bias, 0.0)
+
+    def forward(
+        self,
+        x,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        query: Optional[Tensor] = None,
+    ):
+        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
+        args:
+            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
+            incremental_state: A dict to keep the state
+            (there is no `unfold` argument here: the unfolded implementation is always used)
+            query: use the specified query to predict the conv filters (must be None when in_proj is set)
+        """
+        assert query is None or not self.in_proj
+
+        if query is None:
+            query = x  # filters are predicted from the input itself
+
+        output = self._forward_unfolded(x, incremental_state, query)
+
+        if self.has_conv_bias:
+            output = output + self.conv_bias  # conv_bias is stored as 1 x 1 x C and broadcasts over T and B
+        return output
+
+    def _forward_unfolded(
+        self,
+        x,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        query,
+    ):
+        """The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right."""
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H  # channels per head
+        assert R * H == C == self.input_size
+
+        TxBxH = T * B * H  # number of (timestep, batch, head) rows
+
+        if self.in_proj:
+            proj = self.weight_linear(x)
+            x = proj.narrow(2, 0, self.input_size).contiguous()  # first input_size channels: projected input
+            weight = proj.narrow(2, self.input_size, H * K).contiguous().view(TxBxH, -1)
+        else:
+            weight = self.weight_linear(query).view(TxBxH, -1)
+
+        # renorm_padding is only supported here together with incremental_state
+        assert not self.renorm_padding or incremental_state is not None
+
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is not None:
+                x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)  # append the new step on a trailing axis
+            else:
+                x_unfold = x.unsqueeze(3).clone()  # no history yet: the window holds only this step
+            if self.kernel_size > 1:  # keep only the most recent K - 1 entries for the next call
+                self._set_input_buffer(
+                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
+                )
+            x_unfold = x_unfold.view(TxBxH, R, -1)
+        else:
+            padding_l = self.padding_l
+            if K > T and padding_l == K - 1:  # kernel longer than the sequence: trim taps and padding
+                weight = weight.narrow(1, K - T, T)
+                K, padding_l = T, T - 1
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, K, padding_l, 0.0)
+            x_unfold = x_unfold.view(TxBxH, R, K)
+
+        if self.weight_softmax and not self.renorm_padding:
+            weight = F.softmax(weight, dim=1)  # normalize over the taps before any trimming below
+        weight = weight.narrow(1, 0, K)
+
+        if incremental_state is not None:
+            weight = weight[:, -(x_unfold.size(2)) :]  # keep as many trailing taps as buffered steps
+            K = weight.size(1)
+
+        if self.weight_softmax and self.renorm_padding:
+            weight = F.softmax(weight, dim=1)  # normalize after trimming, over the remaining taps only
+
+        weight = self.weight_dropout_module(weight, inplace=False)
+
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def reorder_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order: Tensor,
+    ):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:  # nothing to reorder until a buffer has been stored
+            input_buffer = input_buffer.index_select(1, new_order)  # reorder along dim 1 (batch axis of T x B x C x window)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ):
+        result = self.get_incremental_state(incremental_state, "input_buffer")
+        if result is not None and "input_buffer" in result:  # the tensor is wrapped in a one-entry dict
+            return result["input_buffer"]
+        else:
+            return None  # no buffer stored yet
+
+    def _set_input_buffer(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_buffer: Optional[Tensor],
+    ):
+        result = self.set_incremental_state(  # the tensor is stored wrapped in a one-entry dict
+            incremental_state, "input_buffer", {"input_buffer": new_buffer}
+        )
+        if result is not None:
+            incremental_state = result  # prefer the state object returned by the setter
+        return incremental_state
+
+    def extra_repr(self):  # one-line configuration summary string
+        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format(  # noqa
+            self.input_size,
+            self.kernel_size,
+            self.padding_l,
+            self.num_heads,
+            self.weight_softmax,
+            self.has_conv_bias,  # conv_bias is always a Parameter in this class, so report the flag instead
+            self.renorm_padding,
+            self.in_proj,
+        )
+
+        if self.query_size != self.input_size:  # only shown when it differs from input_size
+            s += ", query_size={}".format(self.query_size)
+        if self.weight_dropout_module.p > 0.0:  # only shown when dropout is enabled
+            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
+        return s
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
new file mode 100644
index 0000000000..8fcc6b8d26
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
@@ -0,0 +1,189 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This file re-implements the low-rank and beam approximation of the CRF layer
+Proposed by:
+
+Sun, Zhiqing, et al.
+Fast Structured Decoding for Sequence Models
+https://arxiv.org/abs/1910.11555
+
+The CRF implementation is mainly borrowed from
+https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py
+
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
+def logsumexp(x, dim=1):  # reduce in float32, then cast the result back to x's dtype
+    return x.float().logsumexp(dim=dim).type_as(x)
+
+
+class DynamicCRF(nn.Module):
+    """Dynamic CRF layer is used to approximate the traditional
+    Conditional Random Fields (CRF)
+    $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$
+
+    where in this function, we assume the emission scores (s) are given,
+    and the transition score is a |V| x |V| matrix $M$
+
+    in the following two aspects:
+     (1) it used a low-rank approximation for the transition matrix:
+         $M = E_1 E_2^T$
+     (2) it used a beam to estimate the normalizing factor Z(x)
+    """
+
+    def __init__(self, num_embedding, low_rank=32, beam_size=64):
+        super().__init__()
+
+        self.E1 = nn.Embedding(num_embedding, low_rank)  # factor applied to the previous token
+        self.E2 = nn.Embedding(num_embedding, low_rank)  # factor applied to the next token
+
+        self.vocb = num_embedding
+        self.rank = low_rank
+        self.beam = beam_size  # beam width used when a call passes beam=None
+
+    def extra_repr(self):  # one-line configuration summary string
+        return "vocab_size={}, low_rank={}, beam_size={}".format(
+            self.vocb, self.rank, self.beam
+        )
+
+    def forward(self, emissions, targets, masks, beam=None):
+        """
+        Compute the conditional log-likelihood of a sequence of target tokens given emission scores
+
+        Args:
+            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
+                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
+            targets (`~torch.LongTensor`): Sequence of target token indices
+                ``(batch_size, seq_len)``
+            masks (`~torch.ByteTensor`): Mask tensor with the same size as targets
+
+        Returns:
+            `~torch.Tensor`: approximated log-likelihood
+        """
+        numerator = self._compute_score(emissions, targets, masks)  # score of the target path
+        denominator = self._compute_normalizer(emissions, targets, masks, beam)  # beam-approximated normalizer
+        return numerator - denominator
+
+    def forward_decoder(self, emissions, masks=None, beam=None):
+        """
+        Find the most likely output sequence using Viterbi algorithm.
+
+        Args:
+            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
+                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
+            masks (`~torch.ByteTensor`): Mask tensor over ``(batch_size, seq_len)`` positions
+
+        Returns:
+            (scores, tokens): per-step scores and the decoded sequence, as returned by _viterbi_decode
+        """
+        return self._viterbi_decode(emissions, masks, beam)
+
+    def _compute_score(self, emissions, targets, masks=None):
+        batch_size, seq_len = targets.size()  # values unused; the unpacking requires targets to be 2-D
+        emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0]  # B x T
+        transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2)  # B x (T-1)
+
+        scores = emission_scores
+        scores[:, 1:] += transition_scores  # in-place: the transition into step i is added at step i
+
+        if masks is not None:
+            scores = scores * masks.type_as(scores)  # zero out masked positions
+        return scores.sum(-1)
+
+    def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None):
+        # HACK: we include "target" which is a heuristic for training
+        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)
+
+        beam = beam if beam is not None else self.beam
+        batch_size, seq_len = emissions.size()[:2]
+        if targets is not None:  # force the target tokens into the beam by scoring them +inf before top-k
+            _emissions = emissions.scatter(2, targets[:, :, None], float("inf"))  # builtin float; np.float is gone in recent NumPy
+            beam_targets = _emissions.topk(beam, 2)[1]
+            beam_emission_scores = emissions.gather(2, beam_targets)  # real scores, read from the unmodified emissions
+        else:
+            beam_emission_scores, beam_targets = emissions.topk(beam, 2)
+        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
+        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
+        beam_transition_matrix = torch.bmm(
+            beam_transition_score1.view(-1, beam, self.rank),
+            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
+        )
+        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)  # B x (T-1) x K x K
+
+        # compute the normalizer in the log-space
+        score = beam_emission_scores[:, 0]  # B x K
+        for i in range(1, seq_len):
+            next_score = score[:, :, None] + beam_transition_matrix[:, i - 1]
+            next_score = logsumexp(next_score, dim=1) + beam_emission_scores[:, i]
+
+            if masks is not None:
+                score = torch.where(masks[:, i : i + 1], next_score, score)  # masked steps keep the previous score
+            else:
+                score = next_score
+
+        # Sum (log-sum-exp) over all possible tags
+        return logsumexp(score, dim=1)
+
+    def _viterbi_decode(self, emissions, masks=None, beam=None):
+        # HACK: we restrict the search to a beam of top-scoring tokens at each position (which is bad?)
+
+        beam = beam if beam is not None else self.beam
+        batch_size, seq_len = emissions.size()[:2]
+        beam_emission_scores, beam_targets = emissions.topk(beam, 2)
+        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
+        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
+        beam_transition_matrix = torch.bmm(
+            beam_transition_score1.view(-1, beam, self.rank),
+            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
+        )
+        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)  # B x (T-1) x K x K
+
+        traj_tokens, traj_scores = [], []
+        finalized_tokens, finalized_scores = [], []
+
+        # run the max-score recursion over the beam
+        score = beam_emission_scores[:, 0]  # B x K
+        dummy = (  # identity back-pointers, used at masked steps
+            torch.arange(beam, device=score.device).expand(*score.size()).contiguous()
+        )
+
+        for i in range(1, seq_len):
+            traj_scores.append(score)
+            _score = score[:, :, None] + beam_transition_matrix[:, i - 1]
+            _score, _index = _score.max(dim=1)  # best previous beam entry for each current entry
+            _score = _score + beam_emission_scores[:, i]
+
+            if masks is not None:
+                score = torch.where(masks[:, i : i + 1], _score, score)  # masked steps keep the previous score
+                index = torch.where(masks[:, i : i + 1], _index, dummy)
+            else:
+                score, index = _score, _index
+            traj_tokens.append(index)
+
+        # now running the back-tracing and find the best
+        best_score, best_index = score.max(dim=1)
+        finalized_tokens.append(best_index[:, None])
+        finalized_scores.append(best_score[:, None])
+
+        for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)):
+            previous_index = finalized_tokens[-1]
+            finalized_tokens.append(idx.gather(1, previous_index))  # follow the back-pointer one step earlier
+            finalized_scores.append(scs.gather(1, previous_index))
+
+        finalized_tokens.reverse()
+        finalized_tokens = torch.cat(finalized_tokens, 1)
+        finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0]  # beam slots -> token ids
+
+        finalized_scores.reverse()
+        finalized_scores = torch.cat(finalized_scores, 1)
+        finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]  # cumulative -> per-step scores
+
+        return finalized_scores, finalized_tokens
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
new file mode 100644
index 0000000000..22dc6f403d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dynamicconv_layer import DynamicconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
new file mode 100644
index 0000000000..9304f99eb8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
@@ -0,0 +1,223 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def gen_forward():
+    """Write dynamicconv_cuda_forward.cu (host-side launcher) into the working directory."""
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]  # filter sizes that get a `case` label
+    blocks = [32, 64, 128, 256]  # candidate thread-block sizes
+    # Prologue of the generated function: sizes, output allocation, stream. Written verbatim.
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "dynamicconv_cuda.cuh"
+
+std::vector<at::Tensor> dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = weight.size(1);
+    const auto filterSize = weight.size(2);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+    const dim3 blocks(minibatch, numFeatures);
+
+    auto output = at::zeros_like(input);
+    auto stream = at::cuda::getCurrentCUDAStream();
+"""
+    # Opens the dispatch on the runtime filter size. Written verbatim.
+    switch = """
+    switch(filterSize) {
+"""
+    # Label for one filter size; formatted with k.
+    case_k = """
+        case {k}:
+"""
+    # One `if (padding_l == pad) ... else` arm; doubled braces are str.format escapes.
+    main_block = """
+            if (padding_l == {pad}) {{
+                AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{
+                    dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
+                    <<<blocks, {b_size}, 0, stream>>>(
+                            input.data<scalar_t>(),
+                            weight.data<scalar_t>(),
+                            minibatch,
+                            sequenceLength,
+                            numFeatures,
+                            numFiltersInBlock,
+                            numHeads,
+                            output.data<scalar_t>());
+                }}));
+            }} else
+"""
+    # Body of the trailing `else` of each case; written without str.format (single braces).
+    bad_padding = """
+            {
+                std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
+            }
+            break;\n
+"""
+    # `default:` label, end of the switch and the function epilogue. Written verbatim.
+    end = """
+        default:
+            std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
+    }
+
+    return {output};
+}
+"""
+
+    with open("dynamicconv_cuda_forward.cu", "w") as forward:
+        forward.write(head)
+        forward.write(switch)
+        for k in kernels:
+            b_size = 32  # fallback when no candidate exceeds k
+            for b in blocks:
+                if b > k:  # first candidate strictly larger than the filter size
+                    b_size = b
+                    break
+            forward.write(case_k.format(k=k))
+            for pad in [k // 2, k - 1]:  # the only two paddings that get a kernel
+                forward.write(main_block.format(k=k, b_size=b_size, pad=pad))
+            forward.write(bad_padding)  # completes the dangling `else` of the last arm
+        forward.write(end)
+
+
+def gen_backward():
+    """Write dynamicconv_cuda_backward.cu (host-side launcher) into the working directory."""
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]  # filter sizes that get a `case` label
+    thresh = [512, 512, 512, 512, 512, 380, 256, 256]  # per kernel: block size == seq while seq <= thresh
+    min_block = [64, 64, 64, 64, 64, 64, 128, 256]  # per kernel: block size once the sequence is chunked
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]  # buckets 32..512
+    # Prologue of the generated function; the grid starts with a single chunk (numChunks = 1).
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "dynamicconv_cuda.cuh"
+
+std::vector<at::Tensor> dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = weight.size(1);
+    const auto filterSize = weight.size(2);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+    auto numChunks = 1;
+
+    auto gradInput = at::zeros_like(input);
+    auto gradWeight = at::zeros_like(weight);
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    dim3 blocks(minibatch, numHeads, numChunks);
+"""
+    # Opens one sequence-length bucket and its switch; formatted with seq.
+    sequence_if = """
+    if (sequenceLength < {seq}) {{
+        switch(filterSize) {{
+"""
+    # Label for one filter size; formatted with k.
+    case_k = """
+            case {k}:
+"""
+    # Recomputes the grid's z dimension when a case uses fixed-size chunks; formatted with b_size.
+    chunks_reset = """
+                numChunks = int(ceilf(sequenceLength/float({b_size})));
+                blocks = dim3(minibatch, numHeads, numChunks);
+"""
+    # One `if (padding_l == p) ... else` arm; doubled braces are str.format escapes.
+    main_block = """
+                if (padding_l == {p}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{
+                        dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<blocks, {b_size}, 0, stream>>>(
+                                    gradOutput.data<scalar_t>(),
+                                    input.data<scalar_t>(),
+                                    weight.data<scalar_t>(),
+                                    minibatch,
+                                    sequenceLength,
+                                    numFeatures,
+                                    numFiltersInBlock,
+                                    numHeads,
+                                    gradWeight.data<scalar_t>(),
+                                    gradInput.data<scalar_t>());
+                    }}));
+                }} else
+"""
+    # Body of the trailing `else` of each case; written without str.format (single braces).
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
+                }
+                break;\n
+"""
+    # `default:` label; also closes the switch. Written verbatim.
+    bad_filter = """
+            default:
+                std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
+        }
+"""
+    # Closes a bucket's if-block and chains the next one.
+    con_else = """
+    } else
+"""
+    # Opens the fallback block reached when no bucket matched (sequenceLength >= 512).
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+    # Closes the fallback block and returns both gradients.
+    last_return = """
+    }
+    return {gradInput, gradWeight};
+}
+"""
+
+    with open("dynamicconv_cuda_backward.cu", "w") as backward:
+        backward.write(head)
+        for seq in seqs:
+            backward.write(sequence_if.format(seq=seq))
+            for k, t, m in zip(kernels, thresh, min_block):
+                backward.write(case_k.format(k=k))
+                if seq <= t:  # one block of `seq` threads; numChunks stays 1
+                    b_size = seq
+                else:  # fixed block size, sequence split into chunks
+                    b_size = m
+                    backward.write(chunks_reset.format(b_size=b_size))
+                for p in [k // 2, k - 1]:  # the only two paddings that get a kernel
+                    backward.write(main_block.format(k=k, b_size=b_size, p=p))
+                backward.write(bad_padding)
+            backward.write(bad_filter)
+            backward.write(con_else)
+        backward.write(final_else)
+        for k, m in zip(kernels, min_block):  # fallback block: always chunked with min_block
+            backward.write(case_k.format(k=k))
+            backward.write(chunks_reset.format(b_size=m))
+            for p in [k // 2, k - 1]:
+                backward.write(main_block.format(k=k, b_size=m, p=p))
+            backward.write(bad_padding)
+        backward.write(bad_filter)
+        backward.write(last_return)
+
+
+if __name__ == "__main__":  # running the script (re)writes both generated .cu files
+    gen_forward()
+    gen_backward()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
new file mode 100644
index 0000000000..744c363e55
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h>
+#include <vector>
+
+std::vector<at::Tensor>
+dynamicconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l);
+
+std::vector<at::Tensor> dynamicconv_cuda_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters);
+
+// Argument checks used by the Python-facing wrappers below.
+#define CHECK_CUDA(x) \
+  TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) /* one statement, so it is safe under an unbraced if/else */ \
+  do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)
+
+std::vector<at::Tensor>
+dynamicconv_forward(at::Tensor input, at::Tensor filters, int padding_l) {
+  CHECK_INPUT(input); // must be a contiguous CUDA tensor
+  CHECK_INPUT(filters);
+  // Checked arguments are forwarded unchanged to the launcher declared above.
+  return dynamicconv_cuda_forward(input, filters, padding_l);
+}
+
+std::vector<at::Tensor> dynamicconv_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters) {
+  CHECK_INPUT(gradOutput); // each tensor must be a contiguous CUDA tensor
+  CHECK_INPUT(input);
+  CHECK_INPUT(filters);
+  // Checked arguments are forwarded unchanged to the launcher declared above.
+  return dynamicconv_cuda_backward(gradOutput, padding_l, input, filters);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // registers the two checked wrappers as Python callables
+  m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)");
+  m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)");
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh
new file mode 100644
index 0000000000..44baf21bdd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh
@@ -0,0 +1,50 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#define SHFL_MASK 0xffffffff
+
+template <int FS, int SB, int padding_l, typename scalar_t> // FS: filter size, SB: threads per block
+__global__ void dynamicconv_forward_kernel( // defined in dynamicconv_cuda_kernel.cu
+    const scalar_t* input, // B * C * T
+    const scalar_t* weight, // B * H * FS * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock, // numFeatures / numHeads in the generated launcher
+    int numHeads,
+    scalar_t* output); // B * C * T
+
+template <int FS, int SB, int padding_l, typename scalar_t> // FS: filter size, SB: threads per block
+__global__ void dynamicconv_backward_kernel( // defined in dynamicconv_cuda_kernel.cu
+    const scalar_t* gradOutput, // B * C * T
+    const scalar_t* input, // B * C * T
+    const scalar_t* weight, // B * H * k * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    scalar_t* gradWeight, // B * H * k * T
+    scalar_t* gradInput); // B * C * T
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
new file mode 100644
index 0000000000..4630f1e982
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "../cuda_utils.cu"
+#include "dynamicconv_cuda.cuh"
+#include "dynamicconv_cuda_backward.cu"
+#include "dynamicconv_cuda_forward.cu"
+
+// Forward pass. FS is the filter size, SB the thread-block size; kernels are specialized per (FS, SB, padding_l).
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void dynamicconv_forward_kernel(
+    const scalar_t* input, // indexed as B * C * T
+    const scalar_t* weight, // indexed as B * H * FS * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    scalar_t* output) { // indexed as B * C * T
+  assert(blockDim.x == SB);
+  // Each block handles one (batch, feature) pair: blockIdx.x is the batch, blockIdx.y the feature.
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int head = featureIdx / numFiltersInBlock; // head whose filter this feature uses
+  // Start of this (batch, feature) row in the B * C * T buffers.
+  const int IOOffset =
+      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  // Per-thread copy of the FS filter taps for the time step this thread computes.
+  scalar_t filter[FS];
+  // Input tile shared by the block; zeroSharedMem/load_input_to_shared/divUp come from ../cuda_utils.cu (not shown here).
+  __shared__ scalar_t tempInput[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+  // The sequence is processed in tiles of SB time steps.
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    __syncthreads(); // barrier before the shared tile is reloaded
+    const int inputOffset = i * SB;
+    load_input_to_shared<FS, SB, padding_l>(
+        inputFeature,
+        inputOffset,
+        sequenceLength,
+        i,
+        numIterations,
+        false,
+        tempInput);
+    __syncthreads(); // barrier after the load, before tempInput is read
+    if (inputOffset + tid < sequenceLength) { // threads past the end of the sequence do nothing
+#pragma unroll
+      for (int k = 0; k < FS; ++k) {
+        const int filterOffset = batchIdx * numHeads * FS * sequenceLength +
+            head * FS * sequenceLength + k * sequenceLength + i * SB + tid;
+        filter[k] = weight[filterOffset]; // tap k for time step i * SB + tid
+      }
+      // Dot product of the taps with the shared window that starts at tid.
+      scalar_t out = scalar_t(0.0);
+#pragma unroll
+      for (int k = 0; k < FS; ++k) {
+        out += filter[k] * tempInput[tid + k];
+      }
+
+      outputFeature[inputOffset + tid] = out;
+    }
+  }
+}
+
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void dynamicconv_backward_kernel(
+    const scalar_t* gradOutput, // B * C * T
+    const scalar_t* input, // B * C * T
+    const scalar_t* weight, // B * H * k * T
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    scalar_t* gradWeight, // B * H * k * T
+    scalar_t* gradInput) { // B * C * T
+  // Backward pass: writes gradInput per feature and gradWeight summed over the features of a head.
+  assert(blockDim.x == SB);
+
+  // each block operates on a single batch and filter head (and one chunk of SB time steps, blockIdx.z)
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int headIdx = blockIdx.y;
+  const int chunkIdx = blockIdx.z;
+
+  const int numChunks = divUp<int, int>(sequenceLength, SB);
+  const int inputOffset = chunkIdx * SB; // first time step of this chunk
+
+  // initialize shared memory for output gradient and input
+  __shared__ scalar_t tempGradOutput[SB + FS];
+  __shared__ scalar_t tempInput[SB + FS];
+  const int padding = FS - padding_l - 1; // padding value used for the gradOutput tile
+
+  zeroSharedMem<FS, SB, padding>(tempGradOutput);
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+
+  // initialize local filter and weight gradient sum arrays
+  scalar_t tempGradSum[FS];
+  scalar_t bfilter[FS];
+  for (int k = 0; k < FS; ++k) {
+    tempGradSum[k] = scalar_t(0.0);
+    // bfilter[k] reads tap (FS - k - 1) at time idxOffset, i.e. the taps in reversed order; zero outside the sequence.
+    int idxOffset = inputOffset + tid + k - padding;
+    if (idxOffset >= 0 && idxOffset < sequenceLength) {
+      int bfilterOffset = batchIdx * numHeads * FS * sequenceLength +
+          headIdx * FS * sequenceLength + (FS - k - 1) * sequenceLength +
+          idxOffset;
+      bfilter[k] = weight[bfilterOffset];
+    } else {
+      bfilter[k] = scalar_t(0.0);
+    }
+  }
+
+  // iterate over filter block
+  for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) {
+    __syncthreads(); // barrier before the shared tiles are reloaded
+
+    // load input and output gradient for this channel and chunk
+    const int IOOffset = batchIdx * numFeatures * sequenceLength +
+        (headIdx * numFiltersInBlock + featureIdx) * sequenceLength;
+    const scalar_t* inputFeature = &input[IOOffset];
+    const scalar_t* gradOutputFeature = &gradOutput[IOOffset];
+    scalar_t* gradInputFeature = &gradInput[IOOffset];
+
+    load_input_to_shared<FS, SB, padding>(
+        gradOutputFeature,
+        inputOffset,
+        sequenceLength,
+        chunkIdx,
+        numChunks,
+        true,
+        tempGradOutput);
+    load_input_to_shared<FS, SB, padding_l>(
+        inputFeature,
+        inputOffset,
+        sequenceLength,
+        chunkIdx,
+        numChunks,
+        true,
+        tempInput);
+    __syncthreads(); // barrier after the loads, before the tiles are read
+
+    // sum input and weight gradients
+    scalar_t out = scalar_t(0.0);
+#pragma unroll
+    for (int k = 0; k < FS; ++k) {
+      tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding];
+      out += bfilter[k] * tempGradOutput[tid + k];
+    }
+
+    if (inputOffset + tid < sequenceLength) {
+      gradInputFeature[inputOffset + tid] = out;
+    }
+  }
+
+  const int gradOffset =
+      batchIdx * numHeads * FS * sequenceLength + headIdx * FS * sequenceLength;
+  scalar_t* gradWeightFeature = &gradWeight[gradOffset];
+
+  // write weight gradient
+  if (inputOffset + tid < sequenceLength) {
+    for (int k = 0; k < FS; ++k) {
+      const int outputOffset = k * sequenceLength + inputOffset + tid;
+      gradWeightFeature[outputOffset] = tempGradSum[k];
+    }
+  }
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
new file mode 100644
index 0000000000..711ed03483
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
@@ -0,0 +1,227 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dynamicconv_cuda
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.unfold import unfold1d
+from torch import nn
+from torch.autograd import Function
+
+
+class dynamicconvFunction(Function):
+    """Autograd bridge to the compiled ``dynamicconv_cuda`` extension."""
+    @staticmethod
+    def forward(ctx, x, weights, padding_l):
+        # padding_l is not a tensor, so it is kept on ctx rather than saved.
+        ctx.padding_l = padding_l
+        result = dynamicconv_cuda.forward(x, weights, padding_l)
+        ctx.save_for_backward(x, weights)
+        return result[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x, weights = ctx.saved_tensors
+        grad = grad_output.contiguous()
+        grad_input, grad_weights = dynamicconv_cuda.backward(grad, ctx.padding_l, x, weights)
+        return grad_input, grad_weights, None  # no gradient for padding_l
+
+
+@with_incremental_state
+class DynamicconvLayer(nn.Module):
+    """Dynamic convolution whose per-position filters come from ``weight_linear``."""
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding_l=None,
+        weight_softmax=False,
+        num_heads=1,
+        weight_dropout=0.0,
+        bias=False,
+        renorm_padding=False,
+        conv_bias=False,
+        query_size=None,
+    ):
+        """Store the hyper-parameters and build the filter-predicting projection."""
+        super(DynamicconvLayer, self).__init__()
+        self.input_size = input_size
+        self.query_size = input_size if query_size is None else query_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_softmax = weight_softmax
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+        self.renorm_padding = renorm_padding
+        self.bias = bias
+        self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias)
+        if conv_bias:
+            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.conv_bias = None
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Xavier-init ``weight_linear``; zero the biases when ``conv_bias`` is used."""
+        nn.init.xavier_uniform_(self.weight_linear.weight)
+        if self.conv_bias is not None:
+            nn.init.constant_(self.conv_bias, 0.0)
+            if self.weight_linear.bias is not None:  # None when built with bias=False
+                nn.init.constant_(self.weight_linear.bias, 0.0)
+
+    def forward(self, x, incremental_state=None, query=None, unfold=None):
+        """Run the layer on ``x`` (T x B x C) and return a tensor of the same shape."""
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+
+        # during inference time, incremental BMM is faster
+        if incremental_state is not None:
+            unfold = (
+                x.size(0) > 512 if unfold is None else unfold
+            )  # use unfold mode as default for long sequence to save memory
+            unfold = unfold or (incremental_state is not None)  # always True here
+            assert query is None
+            if query is None:
+                query = x
+            if unfold:
+                output = self._forward_unfolded(x, incremental_state, query)
+            else:
+                output = self._forward_expanded(x, incremental_state, query)
+
+            if self.conv_bias is not None:
+                output = output + self.conv_bias.view(1, 1, -1)
+            return output
+
+        # during training time, use CUDA kernel
+        else:
+            weight = self.weight_linear(x).view(T, B, H, K)
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=-1)
+            if self.weight_dropout_module.p:
+                weight = self.weight_dropout_module(weight)
+            # Layout handed to the kernel: weight is B x H x K x T, x is B x C x T.
+            weight = weight.permute(1, 2, 3, 0).contiguous()
+            self.filters = weight
+            x = x.permute(1, 2, 0).contiguous()
+            output = dynamicconvFunction.apply(x, weight, self.padding_l).permute(
+                2, 0, 1
+            )
+            if self.conv_bias is not None:
+                output = output + self.conv_bias.view(1, 1, -1)
+            return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        """Reorder the cached input buffer along dim 1 according to ``new_order``."""
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, "input_buffer")
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(
+            self, incremental_state, "input_buffer", new_buffer
+        )
+
+    def _forward_unfolded(self, x, incremental_state, query):
+        """The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right."""
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+
+        weight = self.weight_linear(query).view(T * B * H, -1)
+
+        # renorm_padding is only implemented in _forward_expanded
+        assert not self.renorm_padding or incremental_state is not None
+
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:
+                self._set_input_buffer(
+                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
+                )
+            x_unfold = x_unfold.view(T * B * H, R, -1)
+        else:
+            padding_l = self.padding_l
+            if K > T and padding_l == K - 1:
+                weight = weight.narrow(1, K - T, T)
+                K, padding_l = T, T - 1
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, K, padding_l, 0)
+            x_unfold = x_unfold.view(T * B * H, R, K)
+
+        if self.weight_softmax and not self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+        weight = weight.narrow(1, 0, K)
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2) :]
+            K = weight.size(1)
+
+        if self.weight_softmax and self.renorm_padding:
+            weight = F.softmax(weight, dim=1)
+
+        weight = self.weight_dropout_module(weight, inplace=False)
+
+        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_stat, query):
+        """Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        """
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H
+        assert R * H == C == self.input_size
+        weight = self.weight_linear(query).view(T * B * H, -1)
+
+        if not self.renorm_padding:
+            if self.weight_softmax:
+                weight = F.softmax(weight, dim=1)
+            weight = self.weight_dropout_module(weight, inplace=False)
+        weight = weight.narrow(1, 0, K).contiguous()
+        weight = weight.view(T, B * H, K).transpose(0, 1)
+
+        x = x.view(T, B * H, R).transpose(0, 1)
+        if self.weight_softmax and self.renorm_padding:
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))
+            weight_expanded.as_strided(
+                (B * H, T, K), (T * (T + K - 1), T + K, 1)
+            ).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
+            # normalize the weight over valid positions like self-attention
+            weight_expanded = F.softmax(weight_expanded, dim=2)
+            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
+        else:
+            P = self.padding_l
+            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
+            if K > T and P == K - 1:
+                weight = weight.narrow(2, K - T, T)
+                K, P = T, T - 1
+            # turn the convolution filters into band matrices
+            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
+            weight_expanded.as_strided(
+                (B * H, T, K), (T * (T + K - 1), T + K, 1)
+            ).copy_(weight)
+            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
+        output = torch.bmm(weight_expanded, x)
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
new file mode 100644
index 0000000000..d7e57c8590
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
@@ -0,0 +1,29 @@
+#include <torch/torch.h>
+#include <vector>
+
+std::vector<float*>
+dynamicconv_cpu_forward(float* input, float* filters, int padding_l);
+
+std::vector<float*> dynamicconv_cpu_backward(
+    float* gradOutput,
+    int padding_l,
+    float* input,
+    float* filters);
+
+std::vector<float*>
+dynamicconv_forward(float* input, float* filters, int padding_l) {
+  return dynamicconv_cpu_forward(input, filters, padding_l); // only declared in this file; NOTE(review): no definition visible here
+}
+
+std::vector<float*> dynamicconv_backward(
+    float* gradOutput,
+    int padding_l,
+    float* input,
+    float* filters) {
+  return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); // only declared in this file; NOTE(review): no definition visible here
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // NOTE(review): setup.py in this directory does not list this file as a source
+  m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)");
+  m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)");
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
new file mode 100644
index 0000000000..6a21f7e2ee
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+
+setup(
+    name="dynamicconv_layer",
+    ext_modules=[
+        CUDAExtension(
+            name="dynamicconv_cuda",  # module name imported by dynamicconv_layer.py
+            sources=[
+                "dynamicconv_cuda.cpp",  # pybind11 bindings and input checks
+                "dynamicconv_cuda_kernel.cu",  # #includes the two .cu files written by cuda_function_gen.py
+            ],
+        ),
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
new file mode 100644
index 0000000000..f0ece842d4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+"""
+Used for EMA tracking a given pytorch module. The user is responsible for calling step()
+and setting the appropriate decay
+"""
+
+import copy
+from dataclasses import dataclass, field
+import logging
+
+import torch
+
+from omegaconf import II
+from fairseq.dataclass import FairseqDataclass
+
+try:
+    from amp_C import multi_tensor_l2norm
+
+    multi_tensor_l2norm_available = True
+except ImportError:
+    multi_tensor_l2norm_available = False
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class EMAModuleConfig(FairseqDataclass):
+    ema_decay: float = field(
+        default=0.9999, metadata={"help": "decay for exponential moving average model"}
+    )
+    ema_fp32: bool = field(
+        default=False,
+        metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"},
+    )
+    add_missing_params: bool = True
+    log_norms: bool = False
+
+
+class EMAModule:
+    """Exponential Moving Average of Fairseq Models"""
+
+    def __init__(
+        self,
+        model,
+        config: EMAModuleConfig,
+        copy_model=True,
+        device=None,
+        skip_keys=None,
+    ):
+        """
+        @param model model to initialize the EMA with
+        @param config EMAModuleConfig object with configuration like
+        ema_decay, ema_fp32, add_missing_params, log_norms
+        @param copy_model If true, track a deep copy of the model with
+        gradients disabled; otherwise track (and update) the given model itself
+        @param device If provided, copy EMA to this device (e.g. gpu).
+        Otherwise EMA is in the same device as the model.
+        @param skip_keys parameter names that step() copies instead of averaging
+        """
+        self.config = config
+        self.decay = config.ema_decay
+        self.skip_keys = skip_keys or set()
+        self.add_missing_params = config.add_missing_params
+        self.fp32_params = {}
+
+        if copy_model:
+            self.model = copy.deepcopy(model)
+            self.model.requires_grad_(False)
+        else:
+            self.model = model
+
+        if device is not None:
+            logger.info(f"Copying EMA model to device {device}")
+            self.model = self.model.to(device=device)
+
+        if self.config.ema_fp32:
+            self.build_fp32_params()
+
+        self.log_norms = config.log_norms and multi_tensor_l2norm_available
+        self.logs = {}
+
+    def build_fp32_params(self, state_dict=None):
+        """
+        Store a copy of the EMA params in fp32.
+        If state dict is passed, the EMA params is copied from
+        the provided state dict. Otherwise, it is copied from the
+        current EMA model parameters.
+        """
+        if not self.config.ema_fp32:
+            raise RuntimeError(
+                "build_fp32_params should not be called if ema_fp32=False. "
+                "Use ema_fp32=True if this is really intended."
+            )
+
+        if state_dict is None:
+            state_dict = self.model.state_dict()
+
+        def _to_float(t):
+            return t.float() if torch.is_floating_point(t) else t
+
+        for param_key in state_dict:
+            if param_key in self.fp32_params:
+                if param_key == "__sq_mom":
+                    self.fp32_params[param_key] = state_dict[param_key]
+                else:
+                    self.fp32_params[param_key].copy_(state_dict[param_key])
+            else:
+                self.fp32_params[param_key] = _to_float(state_dict[param_key])
+                if "__sq_mom" in self.fp32_params:
+                    self.fp32_params["__sq_mom"][param_key] = torch.zeros_like(
+                        self.fp32_params[param_key]
+                    )
+
+    def restore(self, state_dict, build_fp32_params=False):
+        """Load data from a model spec into EMA model"""
+        self.model.load_state_dict(state_dict, strict=False)
+        if build_fp32_params:
+            self.build_fp32_params(state_dict)
+
+    def set_decay(self, decay, weight_decay=None):
+        self.decay = decay
+        if weight_decay is not None:
+            self.weight_decay = weight_decay
+
+    def get_decay(self):
+        return self.decay
+
+    def _step_internal(self, new_model):
+        """One update of the EMA model based on new model weights"""
+        decay = self.decay
+
+        ema_state_dict = {}
+        ema_params = (
+            self.fp32_params if self.config.ema_fp32 else self.model.state_dict()
+        )
+
+        new_p = []
+        ema_p = []
+
+        for key, param in new_model.named_parameters():
+            if isinstance(param, dict):
+                continue
+
+            if not self.add_missing_params and key not in ema_params:
+                continue
+
+            try:
+                ema_param = ema_params[key]
+            except KeyError:
+                ema_param = (
+                    param.float().clone() if param.ndim == 1 else copy.deepcopy(param)
+                )
+                ema_params[key] = ema_param
+
+            if param.shape != ema_param.shape:
+                raise ValueError(
+                    "incompatible tensor shapes between model param and ema param"
+                    + "{} vs. {}".format(param.shape, ema_param.shape)
+                )
+
+            if "version" in key:
+                # Do not decay a model.version pytorch param
+                continue
+
+            lr = 1 - decay
+
+            if key in self.skip_keys or not param.requires_grad:
+                ema_params[key].copy_(param.to(dtype=ema_param.dtype).data)
+                ema_param = ema_params[key]
+            else:
+                if self.log_norms:
+                    new_p.append(param)
+                    ema_p.append(ema_param)
+
+                ema_param.mul_(1 - lr)
+                ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=lr)
+
+            ema_state_dict[key] = ema_param
+
+        for key, param in new_model.named_buffers():
+            ema_state_dict[key] = param
+
+        if self.log_norms:
+            if "model_norm" in self.logs:
+                self.prev_model_norm = self.logs["model_norm"]
+
+            chunk_size = 2048 * 32
+            has_inf = torch.zeros(
+                (1, 1), dtype=torch.int, device=next(new_model.parameters()).device
+            )
+
+            new_norm = multi_tensor_l2norm(chunk_size, has_inf, [new_p], False)
+            old_norm = multi_tensor_l2norm(chunk_size, has_inf, [ema_p], False)
+
+            self.logs["model_norm"] = new_norm[0]
+            self.logs["ema_norm"] = old_norm[0]
+
+        self.restore(ema_state_dict, build_fp32_params=False)
+
+    @torch.no_grad()
+    def step(self, new_model):
+        self._step_internal(new_model)
+
+    def reverse(self, model):
+        """
+        Load the model parameters from EMA model.
+        Useful for inference or fine-tuning from the EMA model.
+        """
+        d = self.model.state_dict()
+        if "_ema" in d:
+            del d["_ema"]
+
+        model.load_state_dict(d, strict=False)
+        return model
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
new file mode 100644
index 0000000000..82bc0d7b45
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Shigeki Karita
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Multi-Head Attention layer definition."""
+
+import math
+
+import torch
+from torch import nn
+
+from fairseq.modules.rotary_positional_embedding import (
+    RotaryPositionalEmbedding,
+    apply_rotary_pos_emb,
+)
+
+
+class ESPNETMultiHeadedAttention(nn.Module):
+    """Multi-Head Attention layer.
+    Args:
+        n_head: The number of heads.
+        n_feat: The number of features.
+        dropout: Dropout rate.
+    """
+
+    def __init__(self, n_feat, n_head, dropout):
+        """Construct an MultiHeadedAttention object."""
+        super(ESPNETMultiHeadedAttention, self).__init__()
+        assert n_feat % n_head == 0
+        # We assume d_v always equals d_k
+        self.d_k = n_feat // n_head
+        self.h = n_head
+        self.linear_q = nn.Linear(n_feat, n_feat)
+        self.linear_k = nn.Linear(n_feat, n_feat)
+        self.linear_v = nn.Linear(n_feat, n_feat)
+        self.linear_out = nn.Linear(n_feat, n_feat)
+        self.attn = None
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward_qkv(self, query, key, value, **kwargs):
+        """Transform query, key and value.
+        Args:
+            query: Query tensor  B X T1 X C
+            key: Key tensor B X T2 X C
+            value: Value tensor  B X T2 X C
+        Returns:
+            torch.Tensor: Transformed query tensor  B X n_head X T1 X d_k
+            torch.Tensor: Transformed key tensor B X n_head X T2 X d_k
+            torch.Tensor: Transformed value tensor  B X n_head X T2 X d_k
+        """
+        n_batch = query.size(0)
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
+        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
+        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
+        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
+        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
+        return q, k, v
+
+    def forward_attention(self, value, scores, mask):
+        """Compute attention context vector.
+        Args:
+            value: Transformed value B X n_head X T2 X d_k.
+            scores: Attention score  B X n_head X T1 X T2
+            mask: Mask  B X T2 (nonzero/True entries are excluded), or None
+        Returns:
+            torch.Tensor: Transformed value  B X T1 X d_model
+                weighted by the attention score  B X T1 X T2
+        """
+        n_batch = value.size(0)
+        if mask is not None:
+            # (batch, time2) -> (batch, 1, 1, time2): the same key positions
+            # are masked for every head and every query position
+            key_mask = mask.unsqueeze(1).unsqueeze(2).to(bool)
+            scores = scores.masked_fill(key_mask, float("-inf"))
+
+        # the softmax is the same whether or not a mask was applied; the
+        # pre-dropout weights are stored in self.attn
+        self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+        p_attn = self.dropout(self.attn)
+        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
+        x = (
+            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
+        )  # (batch, time1, d_model)
+
+        return self.linear_out(x)  # (batch, time1, d_model)
+
+    def forward(self, query, key, value, key_padding_mask=None, **kwargs):
+        """Compute scaled dot product attention.
+        Args:
+            query (torch.Tensor): Query tensor T X B X C
+            key (torch.Tensor): Key tensor T X B X C
+            value (torch.Tensor): Value tensor T X B X C
+            key_padding_mask (torch.Tensor): Mask tensor B X T, or None
+        Returns:
+            tuple: (output tensor T X B X D, None)
+        """
+        query = query.transpose(0, 1)
+        key = key.transpose(0, 1)
+        value = value.transpose(0, 1)
+
+        q, k, v = self.forward_qkv(query, key, value)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
+        scores = self.forward_attention(v, scores, key_padding_mask)
+        scores = scores.transpose(0, 1)
+        return scores, None
+
+
+class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention):
+    """Multi-Head Attention layer with relative position encoding.
+    Paper: https://arxiv.org/abs/1901.02860
+    Args:
+        n_head: The number of heads.
+        n_feat: The number of features.
+        dropout: Dropout rate.
+        zero_triu: Whether to zero the upper triangular part of attention matrix.
+    """
+
+    def __init__(self, n_feat, n_head, dropout, zero_triu=False):
+        """Construct an RelPositionMultiHeadedAttention object."""
+        super().__init__(n_feat, n_head, dropout)
+        self.zero_triu = zero_triu
+        # linear transformation for positional encoding
+        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
+        # these two learnable bias are used in matrix c and matrix d
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k))
+        self.pos_bias_v = nn.Parameter(torch.zeros(self.h, self.d_k))
+        torch.nn.init.xavier_uniform_(self.pos_bias_u)
+        torch.nn.init.xavier_uniform_(self.pos_bias_v)
+
+    def rel_shift(self, x):
+        """Compute relative positional encoding.
+        Args:
+            x: Input tensor B X n_head X T X 2T-1
+        Returns:
+            torch.Tensor: Output tensor.
+        """
+        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
+        x_padded = torch.cat([zero_pad, x], dim=-1)
+
+        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
+        x = x_padded[:, :, 1:].view_as(x)[
+            :, :, :, : x.size(-1) // 2 + 1
+        ]  # only keep the positions from 0 to time2
+
+        if self.zero_triu:
+            ones = torch.ones((x.size(2), x.size(3)), device=x.device)
+            x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+
+        return x
+
+    def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs):
+        """Compute scaled dot product attention.
+        Args:
+            query: Query tensor T X B X C
+            key: Key tensor T X B X C
+            value: Value tensor T X B X C
+            pos_emb: Positional embedding tensor 2T-1 X B X C (made batch-first below)
+            key_padding_mask: Mask tensor B X T (broadcast in forward_attention)
+        Returns:
+            torch.Tensor: Output tensor T X B X C.
+        """
+        query = query.transpose(0, 1)
+        key = key.transpose(0, 1)
+        value = value.transpose(0, 1)
+        pos_emb = pos_emb.transpose(0, 1)
+        q, k, v = self.forward_qkv(query, key, value)
+        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
+        n_batch_pos = pos_emb.size(0)
+        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
+        p = p.transpose(1, 2)  # (batch, head, 2*time1-1, d_k)
+
+        # (batch, head, time1, d_k)
+        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
+        # (batch, head, time1, d_k)
+        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
+
+        # compute attention score
+        # first compute matrix a and matrix c
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        # (batch, head, time1, time2)
+        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
+
+        # compute matrix b and matrix d
+        # (batch, head, time1, 2*time1-1)
+        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
+        matrix_bd = self.rel_shift(matrix_bd)
+
+        scores = (matrix_ac + matrix_bd) / math.sqrt(
+            self.d_k
+        )  # (batch, head, time1, time2)
+
+        scores = self.forward_attention(v, scores, key_padding_mask)
+        scores = scores.transpose(0, 1)
+        return scores, None
+
+
+class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention):
+    def __init__(
+        self,
+        n_feat,
+        n_head,
+        dropout,
+        precision,
+        rotary_emd_base=10000,
+    ):
+        """Construct an RotaryPositionMultiHeadedAttention object."""
+        super().__init__(n_feat, n_head, dropout)
+        self.rotary_ndims = self.d_k  # also try self.d_k//2
+        # "fp16" selects half precision, anything else float32. The argument
+        # used to be overwritten before this check, so it had no effect.
+        rotary_dtype = torch.half if precision == "fp16" else torch.float
+
+        self.rotary_emb = RotaryPositionalEmbedding(
+            self.rotary_ndims, base=rotary_emd_base, precision=rotary_dtype
+        )
+
+    def forward(self, query, key, value, key_padding_mask=None, **kwargs):
+        """Compute rotary position attention.
+        Args:
+            query: Query tensor T X B X C
+            key: Key tensor T X B X C
+            value: Value tensor T X B X C
+            key_padding_mask: Mask tensor T X B
+        Returns:
+            torch.Tensor: Output tensor T X B X D.
+        Notes:
+            Assumes self attn
+        """
+
+        T, B, C = value.size()
+        query = query.view(T, B, self.h, self.d_k)
+        key = key.view(T, B, self.h, self.d_k)
+        value = value.view(T, B, self.h, self.d_k)
+        cos, sin = self.rotary_emb(value, seq_len=T)
+        query, key = apply_rotary_pos_emb(
+            query, key, cos, sin, offset=0
+        )  # offset is based on layer_past
+
+        query = query.view(T, B, self.h * self.d_k)
+        key = key.view(T, B, self.h * self.d_k)
+        value = value.view(T, B, self.h * self.d_k)
+
+        # TBD to BTD
+        query = query.transpose(0, 1)
+        key = key.transpose(0, 1)
+        value = value.transpose(0, 1)
+
+        q, k, v = self.forward_qkv(query, key, value)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
+        scores = self.forward_attention(v, scores, key_padding_mask)
+        scores = scores.transpose(0, 1)
+        return scores, None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
new file mode 100644
index 0000000000..3cddca7718
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+logger = logging.getLogger(__name__)
+
+
+class FairseqDropout(nn.Module):
+    def __init__(self, p, module_name=None):
+        super().__init__()
+        self.p = p
+        self.module_name = module_name
+        self.apply_during_inference = False
+
+    def forward(self, x, inplace: bool = False):
+        if self.p > 0 and (self.training or self.apply_during_inference):
+            return F.dropout(x, p=self.p, training=True, inplace=inplace)
+        else:
+            return x
+
+    def make_generation_fast_(
+        self,
+        name: str,
+        retain_dropout: bool = False,
+        retain_dropout_modules: Optional[List[str]] = None,
+        **kwargs
+    ):
+        if retain_dropout:
+            if retain_dropout_modules is not None and self.module_name is None:
+                logger.warning(
+                    "Cannot enable dropout during inference for module {} "
+                    "because module_name was not set".format(name)
+                )
+            elif (
+                retain_dropout_modules is None  # if None, apply to all modules
+                or self.module_name in retain_dropout_modules
+            ):
+                logger.info(
+                    "Enabling dropout during inference for module: {}".format(name)
+                )
+                self.apply_during_inference = True
+            else:
+                logger.info("Disabling dropout for module: {}".format(name))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
new file mode 100644
index 0000000000..c560f338fd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+batch norm done in fp32 (for fp16 training)
+"""
+import torch
+import torch.nn as nn
+
+
+class Fp32BatchNorm(nn.Module):
+    def __init__(self, sync=False, *args, **kwargs):
+        super().__init__()
+
+        if sync:
+            from fairseq.distributed import utils
+
+            if utils.get_global_world_size() == 1:
+                sync = False
+
+        if sync:
+            self.bn = nn.SyncBatchNorm(*args, **kwargs)
+        else:
+            self.bn = nn.BatchNorm1d(*args, **kwargs)
+
+        self.sync = sync
+
+    def forward(self, input):
+        """Run batch norm in fp32 and cast the result back to input's dtype."""
+        if self.bn.running_mean.dtype != torch.float:
+            if self.sync:
+                self.bn.running_mean = self.bn.running_mean.float()
+                self.bn.running_var = self.bn.running_var.float()
+                if self.bn.affine:
+                    try:
+                        self.bn.weight = self.bn.weight.float()
+                        self.bn.bias = self.bn.bias.float()
+                    except Exception:  # assignment rejected: convert whole module
+                        self.bn.float()
+            else:
+                self.bn.float()
+        output = self.bn(input.float())
+        return output.type_as(input)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
new file mode 100644
index 0000000000..d03aac022e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Group norm done in fp32 (for fp16 training)
+"""
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Fp32GroupNorm(nn.GroupNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        output = F.group_norm(
+            input.float(),
+            self.num_groups,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        )
+        return output.type_as(input)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
new file mode 100644
index 0000000000..30a54496de
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Instance norm done in fp32 (for fp16 training)
+"""
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Fp32InstanceNorm(nn.InstanceNorm1d):
+    def __init__(self, *args, **kwargs):
+        # "transpose_last" is this class's own option, not an InstanceNorm1d
+        # argument: take it out of kwargs (default False) before delegating.
+        self.transpose_last = kwargs.pop("transpose_last", False)
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        if self.transpose_last:
+            input = input.transpose(1, 2)
+        output = F.instance_norm(
+            input.float(),
+            running_mean=self.running_mean,
+            running_var=self.running_var,
+            weight=self.weight.float() if self.weight is not None else None,
+            bias=self.bias.float() if self.bias is not None else None,
+            use_input_stats=self.training or not self.track_running_stats,
+            momentum=self.momentum,
+            eps=self.eps,
+        )
+        if self.transpose_last:
+            output = output.transpose(1, 2)
+        return output.type_as(input)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
new file mode 100644
index 0000000000..a2f1ecff4a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with
+the corresponding GitHub repo: https://github.com/hendrycks/GELUs
+"""
+
+import math
+
+import torch
+import torch.nn as nn
+
+
+def gelu_accurate(x):
+    # tanh form of GELU; sqrt(2 / pi) is computed once and kept on the function
+    if not hasattr(gelu_accurate, "_a"):
+        gelu_accurate._a = math.sqrt(2 / math.pi)
+    inner = gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3))
+    return 0.5 * x * (1 + torch.tanh(inner))
+
+
+def gelu(x: torch.Tensor) -> torch.Tensor:
+    return torch.nn.functional.gelu(x.float()).type_as(x)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
new file mode 100644
index 0000000000..08d15f55df
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+class GradMultiply(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, scale):
+        ctx.scale = scale
+        res = x.new(x)
+        return res
+
+    @staticmethod
+    def backward(ctx, grad):
+        return grad * ctx.scale, None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
new file mode 100644
index 0000000000..867b019f67
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
@@ -0,0 +1,212 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class GumbelVectorQuantizer(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_vars,
+        temp,
+        groups,
+        combine_groups,
+        vq_dim,
+        time_first,
+        activation=nn.GELU(),
+        weight_proj_depth=1,
+        weight_proj_factor=1,
+        hard=True,
+        std=0,
+    ):
+        """Vector quantization using gumbel softmax
+
+        Args:
+            dim: input dimension (channels)
+            num_vars: number of quantized vectors per group
+            temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor)
+            groups: number of groups for vector quantization
+            combine_groups: whether to use the vectors for all groups
+            vq_dim: dimensionality of the resulting quantized vector
+            time_first: if true, expect input in BxTxC format, otherwise in BxCxT
+            activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1
+            weight_proj_depth: number of layers (with activation in between) to project input before computing logits
+            weight_proj_factor: this is used only if weight_proj_depth is > 1. scales the inner dimensionality of
+                                projections by this factor
+        """
+        super().__init__()
+
+        self.groups = groups
+        self.combine_groups = combine_groups
+        self.input_dim = dim
+        self.num_vars = num_vars
+        self.time_first = time_first
+        self.hard = hard
+
+        assert (
+            vq_dim % groups == 0
+        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"
+
+        var_dim = vq_dim // groups
+        num_groups = groups if not combine_groups else 1
+
+        self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim))
+        if std == 0:
+            nn.init.uniform_(self.vars)
+        else:
+            nn.init.normal_(self.vars, mean=0, std=std)
+
+        if weight_proj_depth > 1:
+
+            def block(input_dim, output_dim):
+                return nn.Sequential(nn.Linear(input_dim, output_dim), activation)
+
+            inner_dim = self.input_dim * weight_proj_factor
+            self.weight_proj = nn.Sequential(
+                *[
+                    block(self.input_dim if i == 0 else inner_dim, inner_dim)
+                    for i in range(weight_proj_depth - 1)
+                ],
+                nn.Linear(inner_dim, groups * num_vars),
+            )
+        else:
+            self.weight_proj = nn.Linear(self.input_dim, groups * num_vars)
+            nn.init.normal_(self.weight_proj.weight, mean=0, std=1)
+            nn.init.zeros_(self.weight_proj.bias)
+
+        if isinstance(temp, str):
+            import ast
+
+            temp = ast.literal_eval(temp)
+        assert len(temp) == 3, f"{temp}, {len(temp)}"
+
+        self.max_temp, self.min_temp, self.temp_decay = temp
+        self.curr_temp = self.max_temp
+        self.codebook_indices = None
+
+    def set_num_updates(self, num_updates):
+        self.curr_temp = max(
+            self.max_temp * self.temp_decay**num_updates, self.min_temp
+        )
+
+    def get_codebook_indices(self):
+        if self.codebook_indices is None:
+            from itertools import product
+
+            p = [range(self.num_vars)] * self.groups
+            inds = list(product(*p))
+            self.codebook_indices = torch.tensor(
+                inds, dtype=torch.long, device=self.vars.device
+            ).flatten()
+
+            if not self.combine_groups:
+                self.codebook_indices = self.codebook_indices.view(
+                    self.num_vars**self.groups, -1
+                )
+                for b in range(1, self.groups):
+                    self.codebook_indices[:, b] += self.num_vars * b
+                self.codebook_indices = self.codebook_indices.flatten()
+        return self.codebook_indices
+
+    def codebook(self):
+        indices = self.get_codebook_indices()
+        return (
+            self.vars.squeeze(0)
+            .index_select(0, indices)
+            .view(self.num_vars**self.groups, -1)
+        )
+
+    def sample_from_codebook(self, b, n):
+        indices = self.get_codebook_indices()
+        indices = indices.view(-1, self.groups)
+        cb_size = indices.size(0)
+        assert (
+            n < cb_size
+        ), f"sample size {n} is greater than size of codebook {cb_size}"
+        sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,))
+        indices = indices[sample_idx]
+
+        z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1)
+        return z
+
+    def to_codebook_index(self, indices):
+        res = indices.new_full(indices.shape[:-1], 0)
+        for i in range(self.groups):
+            exponent = self.groups - i - 1
+            res += indices[..., i] * (self.num_vars**exponent)
+        return res
+
+    def forward_idx(self, x):
+        res = self.forward(x, produce_targets=True)
+        return res["x"], res["targets"]
+
+    def forward(self, x, produce_targets=False):
+
+        result = {"num_vars": self.num_vars * self.groups}
+
+        if not self.time_first:
+            x = x.transpose(1, 2)
+
+        bsz, tsz, fsz = x.shape
+        x = x.reshape(-1, fsz)
+        x = self.weight_proj(x)
+        x = x.view(bsz * tsz * self.groups, -1)
+
+        with torch.no_grad():
+            _, k = x.max(-1)
+            hard_x = (
+                x.new_zeros(*x.shape)
+                .scatter_(-1, k.view(-1, 1), 1.0)
+                .view(bsz * tsz, self.groups, -1)
+            )
+            hard_probs = torch.mean(hard_x.float(), dim=0)
+            result["code_perplexity"] = torch.exp(
+                -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
+            ).sum()
+
+        avg_probs = torch.softmax(
+            x.view(bsz * tsz, self.groups, -1).float(), dim=-1
+        ).mean(dim=0)
+        result["prob_perplexity"] = torch.exp(
+            -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
+        ).sum()
+
+        result["temp"] = self.curr_temp
+
+        if self.training:
+            x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as(
+                x
+            )
+        else:
+            x = hard_x
+
+        x = x.view(bsz * tsz, -1)
+
+        vars = self.vars
+        if self.combine_groups:
+            vars = vars.repeat(1, self.groups, 1)
+
+        if produce_targets:
+            result["targets"] = (
+                x.view(bsz * tsz * self.groups, -1)
+                .argmax(dim=-1)
+                .view(bsz, tsz, self.groups)
+                .detach()
+            )
+
+        x = x.unsqueeze(-1) * vars
+        x = x.view(bsz * tsz, self.groups, self.num_vars, -1)
+        x = x.sum(-2)
+        x = x.view(bsz, tsz, -1)
+
+        if not self.time_first:
+            x = x.transpose(1, 2)  # BTC -> BCT
+
+        result["x"] = x
+
+        return result
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
new file mode 100644
index 0000000000..0088d1ebdc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
@@ -0,0 +1,744 @@
+import math
+from functools import reduce, wraps
+from inspect import isfunction
+from operator import mul
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from aml.multimodal_video.utils.einops.lib import rearrange, repeat
+from aml.multimodal_video.utils.einops.lib.layers.torch import Rearrange
+
+from fairseq.modules.local_attention import LocalAttention
+
+# constants
+
+TOKEN_SELF_ATTN_VALUE = -5e4
+KMEAN_INIT_ITERS = 10
+
+# helper functions
+
+
+def exists(val):
+    return val is not None
+
+
+def identity(x, *args, **kwargs):
+    return x
+
+
+def default(x, d):
+    if not exists(x):
+        return d if not isfunction(d) else d()
+    return x
+
+
+def cast_tuple(x):
+    return x if isinstance(x, tuple) else (x,)
+
+
+def cache_fn(f):
+    cache = None
+
+    @wraps(f)
+    def cached_fn(*args, **kwargs):
+        nonlocal cache
+        if exists(cache):
+            return cache
+        cache = f(*args, **kwargs)
+        return cache
+
+    return cached_fn
+
+
+def to(t):
+    return {"device": t.device, "dtype": t.dtype}
+
+
+def find_modules(nn_module, type):
+    return [module for module in nn_module.modules() if isinstance(module, type)]
+
+
+def is_empty(t):
+    return t.nelement() == 0
+
+
+def max_neg_value(tensor):
+    return -torch.finfo(tensor.dtype).max
+
+
+def batched_index_select(values, indices):
+    last_dim = values.shape[-1]
+    return values.gather(2, expand_dim(indices, -1, last_dim))
+
+
+def merge_dims(ind_from, ind_to, tensor):
+    shape = list(tensor.shape)
+    arr_slice = slice(ind_from, ind_to + 1)
+    shape[arr_slice] = [reduce(mul, shape[arr_slice])]
+    return tensor.reshape(*shape)
+
+
+def expand_dim(t, dim, k):
+    """Insert a new axis at position ``dim`` of ``t`` and expand that axis to size ``k``."""
+    t = t.unsqueeze(dim)
+    sizes = [k if axis == dim % t.dim() else -1 for axis in range(t.dim())]
+    return t.expand(*sizes)
+
+
+def scatter_mean(src, t, index, dim, eps=1e-5):
+    numer = src.scatter_add(dim, index, t)
+    denom = src.scatter_add(dim, index, torch.ones_like(t))
+    return numer / (denom + eps)
+
+
+def split_at_index(dim, index, t):
+    """Cut ``t`` in two along axis ``dim``: entries before ``index`` and from ``index`` on."""
+    keep_all = (slice(None),) * dim  # take every axis in front of ``dim`` whole
+    head, tail = slice(None, index), slice(index, None)
+    return t[(*keep_all, head)], t[(*keep_all, tail)]
+
+
+def reshape_dim(t, dim, split_dims):
+    shape = list(t.shape)
+    num_dims = len(shape)
+    dim = (dim + num_dims) % num_dims
+    shape[dim : dim + 1] = split_dims
+    return t.reshape(shape)
+
+
+def ema(old, new, decay):
+    if not exists(old):
+        return new
+    return old * decay + new * (1 - decay)
+
+
+def ema_inplace(moving_avg, new, decay):
+    if is_empty(moving_avg):
+        moving_avg.data.copy_(new)
+        return
+    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
+
+
+# helper classes
+
+
+def map_first_tuple_or_el(x, fn):
+    if isinstance(x, tuple):
+        return (fn(x[0]),) + x[1:]
+    return fn(x)
+
+
+class Chunk(nn.Module):
+    def __init__(self, chunks, fn, along_dim=-1):
+        super().__init__()
+        self.dim = along_dim
+        self.chunks = chunks
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        if self.chunks <= 1:
+            return self.fn(x, **kwargs)
+        chunks = x.chunk(self.chunks, dim=self.dim)
+        return torch.cat([self.fn(c, **kwargs) for c in chunks], dim=self.dim)
+
+
+class PreNorm(nn.ModuleList):
+    def __init__(self, norm_class, dim, fn):
+        super().__init__()
+        self.norm = norm_class(dim)
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        x = self.norm(x)
+        return self.fn(x, **kwargs)
+
+
+class ReZero(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.residual_weight = nn.Parameter(torch.zeros(1))
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        x = self.fn(x, **kwargs)
+        return map_first_tuple_or_el(x, lambda t: t * self.residual_weight)
+
+
+class ScaleNorm(nn.Module):
+    def __init__(self, dim, eps=1e-5):
+        super().__init__()
+        self.g = nn.Parameter(torch.ones(1))
+        self.eps = eps
+
+    def forward(self, x):
+        def norm(t):
+            n = torch.norm(t, dim=-1, keepdim=True).clamp(min=self.eps)
+            return t / n * self.g
+
+        return map_first_tuple_or_el(x, norm)
+
+
+class ProjectInOut(nn.Module):
+    def __init__(self, fn, dim_in, dim_out, project_out=True):
+        super().__init__()
+        self.fn = fn
+        self.project_in = nn.Linear(dim_in, dim_out)
+        self.project_out = nn.Linear(dim_out, dim_in) if project_out else identity
+
+    def forward(self, x, **kwargs):
+        x = self.project_in(x)
+        x, loss = self.fn(x, **kwargs)
+        x = self.project_out(x)
+        return x, loss
+
+
+class MatrixMultiply(nn.Module):
+    def __init__(self, tensor, transpose=False):
+        super().__init__()
+        self.tensor = tensor
+        self.transpose = transpose
+
+    def forward(self, x):
+        tensor = self.tensor
+        if self.transpose:
+            tensor = tensor.t()
+        return x @ tensor
+
+
+# positional embeddings
+
+
+class DepthWiseConv1d(nn.Module):
+    """Depthwise 1-D conv plus 1x1 pointwise conv, zero-padded on the last axis first."""
+
+    def __init__(self, dim_in, dim_out, kernel_size, stride=1, bias=True, causal=False):
+        super().__init__()
+        # Pad kernel_size - 1 zeros in total so a stride-1 conv keeps the input length.
+        # The previous (k // 2, k // 2) split added one extra step for even kernel sizes.
+        total = kernel_size - 1
+        # causal: all padding on the left; otherwise the odd leftover zero goes right
+        self.padding = (total, 0) if causal else (total // 2, total - total // 2)
+
+        self.net = nn.Sequential(
+            nn.Conv1d(
+                dim_in, dim_in, kernel_size, stride=stride, groups=dim_in, bias=bias
+            ),
+            nn.Conv1d(dim_in, dim_out, 1, bias=bias),
+        )
+
+    def forward(self, x):
+        """Pad ``x`` along its last axis, then run both convolutions."""
+        x = F.pad(x, self.padding, value=0.0)
+        return self.net(x)
+
+
+class FixedPositionalEmbedding(nn.Module):
+    def __init__(self, dim, max_seq_len):
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+        position = torch.arange(0, max_seq_len, dtype=torch.float)
+        sinusoid_inp = torch.einsum("i,j->ij", position, inv_freq)
+        emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
+        self.register_buffer("emb", emb)
+
+    def forward(self, x):
+        return self.emb[None, : x.shape[1], :].to(x)
+
+
+def rotate_every_two(x):
+    x = rearrange(x, "... (d j) -> ... d j", j=2)
+    x1, x2 = x.unbind(dim=-1)
+    x = torch.stack((-x2, x1), dim=-1)
+    return rearrange(x, "... d j -> ... (d j)")
+
+
+def apply_rotary_pos_emb(q, k, sinu_pos):
+    sinu_pos = rearrange(sinu_pos, "() n (j d) -> n j d", j=2)
+    sin, cos = sinu_pos.unbind(dim=-2)
+    sin, cos = map(lambda t: repeat(t, "b n -> b (n j)", j=2), (sin, cos))
+    q, k = map(lambda t: (t * cos) + (rotate_every_two(t) * sin), (q, k))
+    return q, k
+
+
+# kmeans related function and class
+
+
+def update_kmeans_on_backwards(module):
+    module.kmean_modules = find_modules(module, Kmeans)
+
+    def hook(_, grad_in, grad_out):
+        for m in module.kmean_modules:
+            m.update()
+
+    return module.register_backward_hook(hook)
+
+
+def similarity(x, means):
+    return torch.einsum("bhld,hcd->bhlc", x, means)
+
+
+def dists_and_buckets(x, means):
+    dists = similarity(x, means)
+    _, buckets = torch.max(dists, dim=-1)
+    return dists, buckets
+
+
+def batched_bincount(index, num_classes, dim=-1):
+    """Count, along ``dim``, how often each value in ``[0, num_classes)`` occurs in ``index``."""
+    out_shape = list(index.shape)
+    out_shape[dim] = num_classes
+    ones = torch.ones_like(index)  # ones_like already keeps index's dtype
+    return index.new_zeros(out_shape).scatter_add_(dim, index, ones)
+
+
+def kmeans_iter(x, means, buckets=None):
+    b, h, _, d, dtype, num_clusters = *x.shape, x.dtype, means.shape[1]
+
+    if not exists(buckets):
+        _, buckets = dists_and_buckets(x, means)
+
+    bins = batched_bincount(buckets, num_clusters).sum(0, keepdim=True)
+    zero_mask = bins.long() == 0
+
+    means_ = buckets.new_zeros(b, h, num_clusters, d, dtype=dtype)
+    means_.scatter_add_(-2, expand_dim(buckets, -1, d), x)
+    means_ = F.normalize(means_.sum(0, keepdim=True), dim=-1).type(dtype)
+
+    means = torch.where(zero_mask.unsqueeze(-1), means, means_)
+    means = means.squeeze(0)
+    return means
+
+
+def distribution(dists, window_size):
+    _, topk_indices = dists.topk(k=window_size, dim=-2)
+    indices = topk_indices.transpose(-2, -1)
+    return indices.reshape(*indices.size()[:2], -1)
+
+
+class Kmeans(nn.Module):
+    def __init__(
+        self, num_heads, head_dim, num_clusters, ema_decay=0.999, commitment=1e-4
+    ):
+        super().__init__()
+        self.commitment = commitment
+        self.ema_decay = ema_decay
+
+        self.register_buffer("means", torch.randn(num_heads, num_clusters, head_dim))
+        self.register_buffer("initted", torch.tensor(False))
+        self.num_new_means = 0
+        self.new_means = None
+
+    @torch.no_grad()
+    def init(self, x):
+        if self.initted:
+            return
+        _, h, _, d, device, _ = *x.shape, x.device, x.dtype
+
+        num_clusters = self.means.shape[1]
+
+        means = x.transpose(0, 1).contiguous().view(h, -1, d)
+        num_samples = means.shape[1]
+
+        if num_samples >= num_clusters:
+            indices = torch.randperm(num_samples, device=device)[:num_clusters]
+        else:
+            indices = torch.randint(0, num_samples, (num_clusters,), device=device)
+
+        means = means[:, indices]
+
+        for _ in range(KMEAN_INIT_ITERS):
+            means = kmeans_iter(x, means)
+
+        self.num_new_means = 0
+        self.means.data.copy_(means)
+        self.initted.data.copy_(torch.tensor(True))
+
+    @torch.no_grad()
+    def update(self, new_means=None):
+        new_means = default(new_means, self.new_means)
+        assert exists(new_means), "new kmeans has not been supplied"
+        ema_inplace(self.means, new_means, self.ema_decay)
+
+        del self.new_means
+        self.new_means = None
+        self.num_new_means = 0
+
+    def forward(self, x, update_means=False):
+        self.init(x)
+
+        b, dtype = x.shape[0], x.dtype
+        means = self.means.type(dtype)
+        x = F.normalize(x, 2, dim=-1).type(dtype)
+
+        with torch.no_grad():
+            dists, buckets = dists_and_buckets(x, means)
+
+        routed_means = batched_index_select(expand_dim(means, 0, b), buckets)
+        loss = F.mse_loss(x, routed_means) * self.commitment
+
+        if update_means:
+            with torch.no_grad():
+                means = kmeans_iter(x, means, buckets)
+            self.new_means = ema(
+                self.new_means, means, self.num_new_means / (self.num_new_means + 1)
+            )
+            self.num_new_means += 1
+
+        return dists, loss
+
+
+# kmeans attention class
+
+
+class KmeansAttention(nn.Module):
+    def __init__(
+        self,
+        num_clusters,
+        window_size,
+        num_heads,
+        head_dim,
+        causal=False,
+        dropout=0.0,
+        ema_decay=0.999,
+        commitment=1e-4,
+        context_window_size=None,
+        receives_context=False,
+        num_mem_kv=0,
+        shared_qk=False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.num_clusters = num_clusters
+        self.head_dim = head_dim
+
+        self.window_size = window_size
+        self.context_window_size = default(context_window_size, window_size)
+        self.causal = causal
+
+        self.shared_qk = shared_qk
+        self.receives_context = receives_context
+        self.kmeans = Kmeans(num_heads, head_dim, num_clusters, ema_decay, commitment)
+        self.dropout = nn.Dropout(dropout)
+
+        self.num_mem_kv = max(num_mem_kv, 1 if causal and not shared_qk else 0)
+        self.mem_key = nn.Parameter(
+            torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim)
+        )
+        self.mem_value = nn.Parameter(
+            torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim)
+        )
+
+    def forward(self, q, k, v, query_mask=None, key_mask=None, **kwargs):
+        b, h, t, d, kv_t, wsz, c_wsz, nc, device, dtype = (
+            *q.shape,
+            k.shape[2],
+            self.window_size,
+            self.context_window_size,
+            self.num_clusters,
+            q.device,
+            q.dtype,
+        )
+        is_reverse = kwargs.pop("_reverse", False)
+
+        out = torch.zeros_like(q, dtype=dtype)
+
+        update_kmeans = self.training and not is_reverse
+
+        key_mask = (
+            default(key_mask, query_mask) if not self.receives_context else key_mask
+        )
+        kv_wsz = wsz if not self.receives_context else c_wsz
+
+        wsz = min(wsz, t)
+        kv_wsz = min(kv_wsz, kv_t)
+
+        if not self.shared_qk or self.receives_context:
+            dists, aux_loss = self.kmeans(torch.cat((q, k), dim=2), update_kmeans)
+            q_dists, k_dists = split_at_index(2, t, dists)
+            indices = distribution(q_dists, wsz)
+            kv_indices = distribution(k_dists, kv_wsz)
+        else:
+            dists, aux_loss = self.kmeans(q, update_kmeans)
+            k = F.normalize(k, dim=-1).to(q)
+            indices = distribution(dists, wsz)
+            kv_indices = indices
+
+        q = batched_index_select(q, indices)
+        k = batched_index_select(k, kv_indices)
+        v = batched_index_select(v, kv_indices)
+
+        reshape_with_window = lambda x: x.reshape(b, h, nc, -1, d)
+        q, k, v = map(reshape_with_window, (q, k, v))
+
+        m_k, m_v = map(
+            lambda x: expand_dim(x, 0, b).to(q), (self.mem_key, self.mem_value)
+        )
+        k, v = map(lambda x: torch.cat(x, dim=3), ((m_k, k), (m_v, v)))
+
+        dots = torch.einsum("bhnid,bhnjd->bhnij", q, k) * (d**-0.5)
+
+        mask_value = max_neg_value(dots)
+
+        if exists(query_mask) or exists(key_mask):
+            query_mask = default(
+                query_mask, lambda: torch.ones((b, t), device=device).bool()
+            )
+            key_mask = default(
+                key_mask, lambda: torch.ones((b, kv_t), device=device).bool()
+            )
+
+            q_mask = expand_dim(query_mask, 1, h).gather(2, indices)
+            kv_mask = expand_dim(key_mask, 1, h).gather(2, kv_indices)
+            q_mask, kv_mask = map(lambda t: t.reshape(b, h, nc, -1), (q_mask, kv_mask))
+            mask = q_mask[:, :, :, :, None] * kv_mask[:, :, :, None, :]
+            mask = F.pad(mask, (self.num_mem_kv, 0), value=1)
+            dots.masked_fill_(~mask, mask_value)
+            del mask
+
+        if self.causal:
+            q_mask, kv_mask = map(
+                lambda t: t.reshape(b, h, nc, -1), (indices, kv_indices)
+            )
+            mask = q_mask[:, :, :, :, None] >= kv_mask[:, :, :, None, :]
+            mask = F.pad(mask, (self.num_mem_kv, 0), value=1)
+            dots.masked_fill_(~mask, mask_value)
+            del mask
+
+        if self.shared_qk:
+            q_mask, kv_mask = map(
+                lambda t: t.reshape(b, h, nc, -1), (indices, kv_indices)
+            )
+            mask = q_mask[:, :, :, :, None] == kv_mask[:, :, :, None, :]
+            mask = F.pad(mask, (self.num_mem_kv, 0), value=0)
+            dots.masked_fill_(mask, TOKEN_SELF_ATTN_VALUE)
+            del mask
+
+        dots = dots.softmax(dim=-1)
+        dots = self.dropout(dots)
+
+        bo = torch.einsum("bhcij,bhcjd->bhcid", dots, v)
+        so = torch.reshape(bo, (b, h, -1, bo.shape[-1])).type(dtype)
+        out = scatter_mean(out, so, indices.unsqueeze(-1).expand_as(so), -2)
+        return out, aux_loss
+
+
+# feedforward
+
+
+class GELU_(nn.Module):
+    def forward(self, x):
+        return (
+            0.5
+            * x
+            * (
+                1
+                + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))
+            )
+        )
+
+
+GELU = nn.GELU if hasattr(nn, "GELU") else GELU_
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, mult=4, dropout=0.0, activation=None, glu=False):
+        super().__init__()
+        activation = default(activation, GELU)
+
+        self.glu = glu
+        self.w1 = nn.Linear(dim, dim * mult * (2 if glu else 1))
+        self.act = activation()
+        self.dropout = nn.Dropout(dropout)
+        self.w2 = nn.Linear(dim * mult, dim)
+
+    def forward(self, x, **kwargs):
+        if not self.glu:
+            x = self.w1(x)
+            x = self.act(x)
+        else:
+            x, v = self.w1(x).chunk(2, dim=-1)
+            x = self.act(x) * v
+
+        x = self.dropout(x)
+        x = self.w2(x)
+        return x
+
+
+# self attention
+
+
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        max_seq_len,
+        heads,
+        local_attn_heads,
+        window_size,
+        dim_head=None,
+        local_attn_window_size=None,
+        local_attn_radius_blocks=1,
+        causal=False,
+        attn_dropout=0.0,
+        dropout=0.0,
+        kmeans_ema_decay=0.999,
+        commitment_factor=1e-4,
+        receives_context=False,
+        context_window_size=None,
+        rel_pos_emb=True,
+        num_mem_kv=0,
+        shared_qk=False,
+        conv_query_kernel=9,
+    ):
+        super().__init__()
+        assert (
+            dim_head or (dim % heads) == 0
+        ), "hidden dimension must be divisible by number of heads"
+        assert (
+            max_seq_len % window_size
+        ) == 0, "maximum sequence length must be divisible by the target window size"
+        assert (
+            local_attn_heads <= heads
+        ), "number of local attention heads must be less than total heads"
+        assert not (
+            receives_context and local_attn_heads > 0
+        ), "local attention cannot be used for self attention with context"
+        assert not (
+            receives_context and causal
+        ), "contextual attention layer cannot be causal"
+
+        local_attn_window_size = default(local_attn_window_size, window_size)
+        context_window_size = default(context_window_size, window_size)
+
+        self.shared_qk = shared_qk
+        self.receives_context = receives_context
+        self.heads = heads
+        self.local_attn_heads = local_attn_heads
+        self.global_attn_heads = heads - local_attn_heads
+
+        self.causal = causal
+        self.window_size = window_size
+
+        dim_head = default(dim_head, dim // heads)
+        dim_heads = dim_head * heads
+        self.dim_head = dim_head
+
+        num_clusters = max_seq_len // window_size
+
+        # local
+
+        local_dim_heads = dim_head * self.local_attn_heads
+
+        if self.local_attn_heads > 0:
+            rel_pos_emb_config = (dim_head, local_attn_heads) if rel_pos_emb else None
+            self.local_attn = LocalAttention(
+                dim=dim_head,
+                window_size=local_attn_window_size,
+                causal=causal,
+                dropout=attn_dropout,
+                rel_pos_emb_config=rel_pos_emb_config,
+                look_backward=local_attn_radius_blocks,
+                look_forward=0 if causal else local_attn_radius_blocks,
+            )
+            self.local_to_qkv = nn.Linear(dim, 3 * local_dim_heads)
+
+        # global
+
+        global_dim_heads = dim_head * self.global_attn_heads
+
+        if self.global_attn_heads > 0:
+            self.global_attn = KmeansAttention(
+                num_clusters,
+                window_size,
+                self.global_attn_heads,
+                dim_head,
+                causal=causal,
+                dropout=attn_dropout,
+                ema_decay=kmeans_ema_decay,
+                commitment=commitment_factor,
+                receives_context=receives_context,
+                num_mem_kv=num_mem_kv,
+                shared_qk=shared_qk,
+            )
+
+        self.to_q = nn.Sequential(
+            Rearrange("b n c -> b c n"),
+            DepthWiseConv1d(dim, global_dim_heads, conv_query_kernel, causal=causal),
+            Rearrange("b c n -> b n c"),
+        )
+
+        self.to_v = nn.Linear(dim, global_dim_heads, bias=False)
+
+        if not self.shared_qk:
+            self.to_k = nn.Linear(dim, global_dim_heads, bias=False)
+
+        # out
+
+        self.to_out = nn.Linear(dim_heads, dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(
+        self,
+        query,
+        key,
+        value,
+        context=None,
+        key_padding_mask=None,
+        context_mask=None,
+        pos_emb=None,
+        **kwargs
+    ):
+        assert not (
+            self.receives_context and not exists(context)
+        ), "context must be passed if self attention is set to receive context"
+        input_mask = key_padding_mask
+        x = query.transpose(0, 1)
+        b, t, _, h, dh = *x.shape, self.heads, self.dim_head
+        has_local, has_global = map(
+            lambda x: x > 0, (self.local_attn_heads, self.global_attn_heads)
+        )
+
+        split_heads = (
+            lambda v: reshape_dim(v, -1, (-1, dh)).transpose(1, 2).contiguous()
+        )
+
+        if has_local:
+            local_qkv = self.local_to_qkv(x).chunk(3, dim=-1)
+            lq, lk, lv = map(split_heads, local_qkv)
+
+        if has_global:
+            kv_input = x if not self.receives_context else context
+
+            q, v = self.to_q(x), self.to_v(kv_input)
+
+            if not self.shared_qk:
+                k = self.to_k(kv_input)
+            else:
+                k = self.to_q(kv_input) if self.receives_context else q
+
+            q, k, v = map(split_heads, (q, k, v))
+
+        out = []
+        total_loss = torch.tensor(0.0, requires_grad=True, **to(x))
+
+        if has_local:
+            local_out = self.local_attn(lq, lk, lv, input_mask=input_mask)
+            out.append(local_out)
+
+        if has_global:
+            if not self.receives_context and exists(pos_emb):
+                q, k = apply_rotary_pos_emb(q, k, pos_emb)
+
+            global_out, loss = self.global_attn(
+                q, k, v, query_mask=input_mask, key_mask=context_mask
+            )
+            total_loss = total_loss + loss
+
+            out.append(global_out)
+
+        out = torch.cat(out, dim=1)
+        out = out.reshape(b, h, t, -1).transpose(1, 2).reshape(b, t, -1)
+        out = self.dropout(out.transpose(0, 1))
+        # out = self.to_out(out)
+        return out, total_loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
new file mode 100644
index 0000000000..1015c38999
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from fairseq.modules import Fp32GroupNorm
+
+
+class KmeansVectorQuantizer(nn.Module):
+    def __init__(
+        self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25
+    ):
+        """Vector quantization using straight pass-through estimator (i.e. kmeans)
+
+        Args:
+            dim: input dimension (channels)
+            num_vars: number of quantized vectors per group
+            groups: number of groups for vector quantization
+            combine_groups: whether to use the vectors for all groups
+            vq_dim: dimensionality of the resulting quantized vector
+            time_first: if true, expect input in BxTxC format, otherwise in BxCxT
+            gamma: commitment loss coefficient
+        """
+        super().__init__()
+
+        self.groups = groups
+        self.combine_groups = combine_groups
+        self.input_dim = dim
+        self.num_vars = num_vars
+        self.vq_dim = vq_dim
+        self.time_first = time_first
+
+        assert (
+            vq_dim % groups == 0
+        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"
+
+        self.var_dim = vq_dim // groups
+        num_groups = groups if not combine_groups else 1
+
+        self.embedding = nn.Parameter(
+            0.01 * torch.randn(num_vars, num_groups, self.var_dim)
+        )
+        self.projection = nn.Sequential(
+            nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False),
+            Fp32GroupNorm(groups, dim),
+        )
+        self.gamma = gamma
+        self.mse_mean = nn.MSELoss(reduction="mean")
+
+    def _pass_grad(self, x, y):
+        """Manually set gradient for backward pass.
+        for y = f(x), ensure that during the backward pass,
+        dL/dy = dL/dx regardless of f(x).
+        Returns:
+            y, with the gradient forced to be dL/dy = dL/dx.
+        """
+
+        return y.detach() + (x - x.detach())
+
+    @property
+    def expand_embedding(self):
+        if self.combine_groups:
+            return self.embedding.expand(self.num_vars, self.groups, self.var_dim)
+        return self.embedding
+
+    def forward_idx(self, x):
+        res = self.forward(x, produce_targets=True)
+        return res["x"], res["targets"]
+
+    def forward(self, x, produce_targets=False):
+
+        result = {"num_vars": self.num_vars}
+
+        if self.time_first:
+            x = x.transpose(1, 2)
+
+        bsz, fsz, tsz = x.shape
+
+        ze = self.projection(x)
+        ze_ = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2)
+        d = (
+            (ze_.unsqueeze(0) - self.expand_embedding.unsqueeze(1).unsqueeze(1))
+            .view(self.num_vars, bsz, tsz, self.groups, -1)
+            .norm(dim=-1, p=2)
+        )
+        idx = d.argmin(dim=0)
+        zq = (
+            torch.stack(
+                [
+                    self.expand_embedding[idx[..., group], group]
+                    for group in range(self.groups)
+                ],
+                dim=-2,
+            )
+            .view(bsz, tsz, self.groups * self.var_dim)
+            .permute(0, 2, 1)
+        )
+        assert ze.shape == zq.shape, (ze.shape, zq.shape)
+        x = self._pass_grad(ze, zq)
+
+        with torch.no_grad():
+            hard_x = (
+                idx.new_zeros(bsz * tsz * self.groups, self.num_vars)
+                .scatter_(-1, idx.view(-1, 1), 1.0)
+                .view(bsz * tsz, self.groups, -1)
+            )
+            hard_probs = torch.mean(hard_x.float(), dim=0)
+            result["code_perplexity"] = torch.exp(
+                -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
+            ).sum()
+
+        if produce_targets:
+            result["targets"] = idx
+
+        if self.time_first:
+            x = x.transpose(1, 2)  # BCT -> BTC
+        result["x"] = x
+
+        ze = ze.float()
+        zq = zq.float()
+        latent_loss = self.mse_mean(zq, ze.detach())
+        commitment_loss = self.mse_mean(ze, zq.detach())
+
+        result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss
+
+        return result
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
new file mode 100644
index 0000000000..8961d8bcbc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+LayerDrop as described in https://arxiv.org/abs/1909.11556.
+"""
+
+import torch
+import torch.nn as nn
+
+
+class LayerDropModuleList(nn.ModuleList):
+    """
+    A LayerDrop implementation based on :class:`torch.nn.ModuleList`.
+
+    We refresh the choice of which layers to drop every time we iterate
+    over the LayerDropModuleList instance. During evaluation we always
+    iterate over all layers.
+
+    Usage::
+
+        layers = LayerDropModuleList(p=0.5, modules=[layer1, layer2, layer3])
+        for layer in layers:  # this might iterate over layers 1 and 3
+            x = layer(x)
+        for layer in layers:  # this might iterate over all layers
+            x = layer(x)
+        for layer in layers:  # this might not iterate over any layers
+            x = layer(x)
+
+    Args:
+        p (float): probability of dropping out each layer
+        modules (iterable, optional): an iterable of modules to add
+    """
+
+    def __init__(self, p, modules=None):
+        super().__init__(modules)
+        self.p = p
+
+    def __iter__(self):
+        dropout_probs = torch.empty(len(self)).uniform_()  # one U[0, 1) draw per layer
+        for i, m in enumerate(super().__iter__()):
+            if not self.training or (dropout_probs[i] > self.p):  # eval mode keeps all
+                yield m
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
new file mode 100644
index 0000000000..0b276ce02f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+try:
+    from apex.normalization import FusedLayerNorm as _FusedLayerNorm
+
+    has_fused_layernorm = True
+
+    class FusedLayerNorm(_FusedLayerNorm):
+        @torch.jit.unused
+        def forward(self, x):
+            if not x.is_cuda:
+                return super().forward(x)
+            else:
+                with torch.cuda.device(x.device):
+                    return super().forward(x)
+
+except ImportError:
+    has_fused_layernorm = False
+
+
+def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
+    """Build a LayerNorm: the apex fused one if CUDA and apex are usable and not exporting."""
+    export = export or torch.jit.is_scripting() or torch.jit.is_tracing()
+    if export or not (torch.cuda.is_available() and has_fused_layernorm):
+        return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
+    return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
+
+
+class Fp32LayerNorm(nn.LayerNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        output = F.layer_norm(
+            input.float(),
+            self.normalized_shape,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        )
+        return output.type_as(input)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
new file mode 100644
index 0000000000..378d0f7071
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from torch import Tensor
+
+
+class LearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    Padding ids are ignored by either offsetting based on padding_idx
+    or by setting padding_idx to None and ensuring that the appropriate
+    position ids are passed to the forward function.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.onnx_trace = False  # forwarded to utils.make_positions in forward()
+        if self.padding_idx is not None:
+            self.max_positions = self.num_embeddings - self.padding_idx - 1  # rows 0..padding_idx are not counted
+        else:
+            self.max_positions = self.num_embeddings
+
+    def forward(
+        self,
+        input: Tensor,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        positions: Optional[Tensor] = None,
+    ):
+        """Return position embeddings; input is expected to be of size [bsz x seqlen]."""
+        assert (positions is None) or (
+            self.padding_idx is None
+        ), "If positions is pre-computed then padding_idx should not be set."
+        # No explicit positions: derive them from the decoding step or via utils.make_positions.
+        if positions is None:
+            if incremental_state is not None:
+                # positions is the same for every token when decoding a single step
+                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
+                positions = torch.zeros(
+                    (1, 1), device=input.device, dtype=input.dtype
+                ).fill_(int(self.padding_idx + input.size(1)))  # NOTE(review): TypeError if padding_idx is None
+            else:
+                positions = utils.make_positions(
+                    input, self.padding_idx, onnx_trace=self.onnx_trace
+                )
+        return F.embedding(
+            positions,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
new file mode 100644
index 0000000000..3b2a99c122
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .lightconv_layer import LightconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
new file mode 100644
index 0000000000..a25433dd8e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
@@ -0,0 +1,289 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def gen_forward():
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
+
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+
+std::vector<at::Tensor> lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) {
+
+    at::DeviceGuard g(input.device());
+    const auto minibatch = input.size(0);
+    const auto numFeatures = input.size(1);
+    const auto sequenceLength = input.size(2);
+
+    const auto numHeads = filters.size(0);
+    const auto filterSize = filters.size(1);
+
+    const auto numFiltersInBlock = numFeatures / numHeads;
+
+    const dim3 blocks(minibatch, numFeatures);
+
+    auto output = at::zeros_like(input);
+    auto stream = at::cuda::getCurrentCUDAStream();
+"""
+
+    sequence_if = """
+    if (sequenceLength <= {seq}) {{
+        switch(filterSize) {{
+"""
+
+    case_k = """
+            case {k}:
+"""
+
+    main_block = """
+                if (padding_l == {pad}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{
+                        lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
+                        <<<blocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                filters.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                output.data<scalar_t>());
+                    }}));
+                }} else
+"""
+
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
+                }
+                break;
+"""
+
+    bad_filter = """
+            default:
+                std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
+        }
+"""
+
+    con_else = """
+    } else
+"""
+
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+
+    final_return = """
+    }
+
+    return {output};
+}
+"""
+
+    with open("lightconv_cuda_forward.cu", "w") as forward:
+        forward.write(head)
+        for seq in seqs:
+            forward.write(sequence_if.format(seq=seq))
+            for k in kernels:
+                forward.write(case_k.format(k=k))
+                for pad in [k // 2, k - 1]:
+                    forward.write(main_block.format(k=k, b_size=seq, pad=pad))
+                forward.write(bad_padding)
+            forward.write(bad_filter)
+            forward.write(con_else)
+
+        forward.write(final_else)
+        for k in kernels:
+            forward.write(case_k.format(k=k))
+            for pad in [k // 2, k - 1]:
+                forward.write(main_block.format(k=k, b_size=seq, pad=pad))
+            forward.write(bad_padding)
+        forward.write(bad_filter)
+        forward.write(final_return)
+
+
+def gen_backward():
+    """Write lightconv_cuda_backward.cu: dispatch on filter size, then sequence-length bucket, then padding."""
+    head = """
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+
+std::vector<at::Tensor> lightconv_cuda_backward(
+        at::Tensor gradOutput,
+        int padding_l,
+        at::Tensor input,
+        at::Tensor filters) {
+
+    // gradWrtInput
+    const int minibatch = input.size(0);
+    const int numFeatures = input.size(1);
+    const int sequenceLength = input.size(2);
+
+    const int numHeads = filters.size(0);
+    const int filterSize = filters.size(1);
+
+    const dim3 gradBlocks(minibatch, numFeatures);
+    const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads);
+    const dim3 weightGradSecondpassBlocks(numHeads, filterSize);
+
+    const int numFiltersInBlock = numFeatures / numHeads;
+
+    auto gradInput = at::zeros_like(input);
+    auto gradFilters = at::zeros_like(filters);
+
+    at::DeviceGuard g(input.device());
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    switch(filterSize) {
+"""
+
+    sequence_if = """
+            if (sequenceLength <= {seq}) {{
+"""
+
+    case_k = """
+        case {k}:
+"""
+
+    main_block = """
+                if (padding_l == {p}) {{
+                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{
+                        lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<gradBlocks, {b_size}, 0, stream>>>(
+                                gradOutput.data<scalar_t>(),
+                                filters.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                gradInput.data<scalar_t>());
+
+"""
+
+    weight_grad_short = """
+                        at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat));
+                        lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<weightGradFirstpassShortBlocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                gradOutput.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                numHeads,
+                                tempSumGradFilters.data<float>()
+                        );
+
+                        lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t>
+                        <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>(
+                                tempSumGradFilters.data<float>(),
+                                minibatch,
+                                numFiltersInBlock,
+                                gradFilters.data<scalar_t>()
+                        );
+                    }}));
+                }} else
+"""
+
+    weight_grad = """
+                        at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat));
+                        lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t>
+                        <<<gradBlocks, {b_size}, 0, stream>>>(
+                                input.data<scalar_t>(),
+                                gradOutput.data<scalar_t>(),
+                                minibatch,
+                                sequenceLength,
+                                numFeatures,
+                                numFiltersInBlock,
+                                tempSumGradFilters.data<float>()
+                        );
+
+                        lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t>
+                        <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>(
+                                tempSumGradFilters.data<float>(),
+                                minibatch,
+                                numFiltersInBlock,
+                                gradFilters.data<scalar_t>()
+                        );
+                    }}));
+                }} else
+"""
+
+    bad_padding = """
+                {
+                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
+                }
+"""
+
+    breakout = """
+                break;
+"""
+
+    bad_filter = """
+        default:
+            std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
+"""
+
+    con_else = """
+            } else
+"""
+    # NOTE(review): final_else is defined here but never written to the output file below.
+    final_else = """
+    {
+        switch(filterSize) {
+"""
+
+    last_return = """
+    }
+    return {gradInput, gradFilters};
+}
+"""
+
+    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
+    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
+    thresh = [32, 32, 64, 128, 256, -1, -1, -1]  # per kernel: largest seq (inclusive) using the "short" kernels; -1 = no limit
+    max_mem = [-1, -1, -1, -1, -1, 192, 96, 64]  # per kernel: seq must be strictly below this for "short"; -1 = no limit
+    # Per filter size: "short" weight-grad kernels for small buckets, then one generic else branch.
+    with open("lightconv_cuda_backward.cu", "w") as backward:
+        backward.write(head)
+        for (k, t, mem) in zip(kernels, thresh, max_mem):
+            backward.write(case_k.format(k=k))
+            for seq in seqs:
+                if (t == -1 or seq <= t) and (mem == -1 or seq < mem):
+                    backward.write(sequence_if.format(seq=seq))
+                    for p in [k // 2, k - 1]:
+                        backward.write(main_block.format(k=k, b_size=seq, p=p))
+                        backward.write(weight_grad_short.format(k=k, b_size=seq, p=p))
+                    backward.write(bad_padding)
+                else:
+                    for p in [k // 2, k - 1]:
+                        backward.write(main_block.format(k=k, b_size=32, p=p))
+                        backward.write(weight_grad.format(k=k, b_size=32, p=p))
+                    backward.write(bad_padding)
+                    backward.write(breakout)
+                    break  # the generic branch is the final else, so no further buckets are emitted
+                backward.write(con_else)
+        backward.write(bad_filter)
+        backward.write(last_return)
+
+
+if __name__ == "__main__":
+    gen_forward()  # writes lightconv_cuda_forward.cu into the current working directory
+    gen_backward()  # writes lightconv_cuda_backward.cu into the current working directory
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
new file mode 100644
index 0000000000..ece47a8d90
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
@@ -0,0 +1,51 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h>
+#include <vector>
+
+std::vector<at::Tensor> // defined in lightconv_cuda_forward.cu (written by cuda_function_gen.py)
+lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l);
+// Defined in lightconv_cuda_backward.cu (written by cuda_function_gen.py).
+std::vector<at::Tensor> lightconv_cuda_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters);
+
+#define CHECK_CUDA(x) \
+  AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+std::vector<at::Tensor>
+lightconv_forward(at::Tensor input, at::Tensor filters, int padding_l) {
+  CHECK_INPUT(input); // must be a contiguous CUDA tensor
+  CHECK_INPUT(filters); // must be a contiguous CUDA tensor
+  // Validation passed; delegate to the CUDA implementation.
+  return lightconv_cuda_forward(input, filters, padding_l);
+}
+
+std::vector<at::Tensor> lightconv_backward(
+    at::Tensor gradOutput,
+    int padding_l,
+    at::Tensor input,
+    at::Tensor filters) {
+  CHECK_INPUT(gradOutput); // each argument must be a contiguous CUDA tensor
+  CHECK_INPUT(input);
+  CHECK_INPUT(filters);
+  // Validation passed; delegate to the CUDA implementation.
+  return lightconv_cuda_backward(gradOutput, padding_l, input, filters);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &lightconv_forward, "lightconv forward (CUDA)"); // forward(input, filters, padding_l)
+  m.def("backward", &lightconv_backward, "lightconv backward (CUDA)"); // backward(gradOutput, padding_l, input, filters)
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh
new file mode 100644
index 0000000000..610ab399e9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh
@@ -0,0 +1,79 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <assert.h>
+#include <stdlib.h>
+
+#define SHFL_MASK 0xffffffff
+// Forward pass kernel; the generated launcher uses a (minibatch, numFeatures) grid of SB threads.
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_forward_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output);
+// Gradient w.r.t. the layer input; the generated launcher passes gradOutput as `input`.
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_input_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output);
+// Filter gradient, first pass ("short" variant): launched on a (minibatch, numHeads) grid.
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    float* output);
+// Filter gradient, second pass ("short" variant): sums the first-pass float buffer over the minibatch.
+template <int FS, int SB, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output);
+// Filter gradient, first pass (generic variant): launched on a (minibatch, numFeatures) grid.
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_firstpass_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    float* output);
+// Filter gradient, second pass (generic variant): sums over minibatch * numFiltersInBlock entries.
+template <int FS, int SB, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_secondpass_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output);
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
new file mode 100644
index 0000000000..cdf31d5d2d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
@@ -0,0 +1,400 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "../cuda_utils.cu"
+#include "lightconv_cuda.cuh"
+#include "lightconv_cuda_backward.cu"
+#include "lightconv_cuda_forward.cu"
+
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_forward_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output) {
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  // One block per (batch, feature) row; IOOffset is that row's start in the flattened tensors.
+  const int IOOffset =
+      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+  // The launch must use exactly SB threads per block.
+  assert(blockDim.x == SB);
+  // Per-thread copy of the FS filter taps selected by filterIdx.
+  scalar_t filter[FS];
+#pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[i];
+  }
+  // Shared staging buffer of SB + FS elements, zero-initialised by the cuda_utils helper.
+  __shared__ scalar_t temp[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(temp);
+  // The sequence is processed in chunks of SB positions.
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding_l>(
+        inputFeature,
+        inputOffset,
+        sequenceLength,
+        i,
+        numIterations,
+        (numIterations == 1),
+        temp);
+
+    __syncthreads();
+    // Each thread forms the FS-tap weighted sum starting at its own index in temp.
+    scalar_t out = 0;
+#pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+    // Barrier before the next iteration's load_input_to_shared call touches temp again.
+    __syncthreads();
+  }
+}
+
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_input_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output) {
+  // input grad kernel is similar to forward kernel
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  // One block per (batch, feature) row; IOOffset is that row's start in the flattened tensors.
+  const int IOOffset =
+      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+  // The launch must use exactly SB threads per block.
+  assert(blockDim.x == SB);
+
+  scalar_t filter[FS];
+
+// The only change is loading the filter in reverse
+#pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[FS - i - 1];
+  }
+  // Note the padding template argument below is FS - padding_l - 1, not padding_l.
+  __shared__ scalar_t temp[SB + FS];
+  const int padding = FS - padding_l - 1;
+  zeroSharedMem<FS, SB, padding>(temp);
+
+  __syncthreads();
+  // The sequence is processed in chunks of SB positions.
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding>(
+        inputFeature,
+        inputOffset,
+        sequenceLength,
+        i,
+        numIterations,
+        false,
+        temp);
+
+    __syncthreads();
+    // Each thread forms the FS-tap weighted sum starting at its own index in temp.
+    scalar_t out = 0;
+#pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+    // Barrier before the next iteration's load_input_to_shared call touches temp again.
+    __syncthreads();
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    float* output) {
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int filterIdx = blockIdx.y;
+  // One block per (batch, filter) pair; results land at output[filterIdx][tap][batchIdx].
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch];
+  // The launch must use exactly SB threads per block.
+  assert(blockDim.x == SB);
+
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+
+  // local weight accumulation
+  float accumWeights[FS];
+
+  // Initialize memory
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+
+  // loop over each sequence within filterblock
+  for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock;
+       ++idxInFilterBlock) {
+    const int featureOffset = batchIdx * numFeatures * sequenceLength +
+        (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength;
+    const scalar_t* inputFeature = &input[featureOffset];
+    const scalar_t* gradInputFeature = &gradInput[featureOffset];
+    // The input buffer uses padding_l while the gradient buffer always uses FS / 2.
+    zeroSharedMem<FS, SB, padding_l>(tempInput);
+    zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
+    __syncthreads();
+
+    for (int i = 0; i < numIterations; ++i) {
+      const int inputOffset = i * SB;
+
+      load_input_to_shared<FS, SB, padding_l>(
+          inputFeature,
+          inputOffset,
+          sequenceLength,
+          i,
+          numIterations,
+          false,
+          tempInput);
+      load_input_to_shared<FS, SB, (FS / 2)>(
+          gradInputFeature,
+          inputOffset,
+          sequenceLength,
+          i,
+          numIterations,
+          false,
+          tempGradInput);
+
+      __syncthreads();
+      // Gradient value this thread pairs with each of its FS input taps.
+      const int gradIndex = (FS / 2) + tid;
+      scalar_t tempGrad = tempGradInput[gradIndex];
+
+#pragma unroll
+      for (int j = 0; j < FS; j++) {
+        const int inputIndex = tid + j;
+        accumWeights[j] += tempInput[inputIndex] * tempGrad;
+      }
+      // Barrier before the shared buffers are touched again by the next iteration.
+      __syncthreads();
+    }
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+    float temp;
+    if (tid < sequenceLength) {
+      temp = accumWeights[filterWeightIdx];
+    } else {
+      temp = float(0.0);
+    }
+
+    const int outputOffset = filterWeightIdx * minibatch + batchIdx;
+    // blockReduce comes from cuda_utils.cu; presumably a block-wide sum (TODO confirm).
+    temp = blockReduce(temp);
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template <int FS, int SB, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+  assert(blockDim.x == SB);
+  // NOTE: numFiltersInBlock is not used in this kernel body.
+  const int tid = threadIdx.x;
+
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+  // Start of the minibatch-long run of partial sums for this (filter, tap).
+  const int inputOffset =
+      filterIdx * FS * minibatch + filterWeightIdx * minibatch;
+  const float* tempInput = &input[inputOffset];
+
+  // Each thread sums every SB-th entry of the run, starting at its own tid.
+  int readIndex = tid;
+
+  float sum = 0.0;
+  while (readIndex < minibatch) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+  // blockReduce comes from cuda_utils.cu; presumably a block-wide sum (TODO confirm).
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template <int FS, int SB, int padding_l, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_firstpass_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    float* output) {
+  assert(blockDim.x == SB);
+  // One block per (batch, feature) row; the feature index is split into filter and in-filter index.
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  const int idxInFilterBlock = featureIdx % numFiltersInBlock;
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  float temp;
+  // The input buffer uses padding_l while the gradient buffer always uses FS / 2.
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+  zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
+  __syncthreads();
+  // Per-thread float accumulators, one per filter tap.
+  float accumWeights[FS];
+
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+
+  const int IOOffset =
+      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  const scalar_t* gradInputFeature = &gradInput[IOOffset];
+  float* tempOutputGradWeight =
+      &output[filterIdx * FS * minibatch * numFiltersInBlock];
+
+  for (int i = 0; i < numIterations; ++i) {
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<FS, SB, padding_l>(
+        inputFeature,
+        inputOffset,
+        sequenceLength,
+        i,
+        numIterations,
+        false,
+        tempInput);
+    load_input_to_shared<FS, SB, (FS / 2)>(
+        gradInputFeature,
+        inputOffset,
+        sequenceLength,
+        i,
+        numIterations,
+        false,
+        tempGradInput);
+    __syncthreads();
+
+#pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS / 2)];
+    }
+    // Barrier before the shared buffers are touched again by the next iteration.
+    __syncthreads();
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+    // Threads with tid >= sequenceLength contribute zero to the reduction
+    if (tid < sequenceLength) {
+      temp = accumWeights[filterWeightIdx];
+    } else {
+      temp = float(0.0);
+    }
+    // blockReduce comes from cuda_utils.cu; presumably a block-wide sum (TODO confirm).
+    temp = blockReduce(temp);
+
+    const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock +
+        batchIdx * numFiltersInBlock + idxInFilterBlock;
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template <int FS, int SB, typename scalar_t>
+__global__ void lightconv_grad_wrt_weights_secondpass_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+  assert(blockDim.x == SB);
+  const int tid = threadIdx.x;
+
+  // One block per (filter, tap) pair
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+  // Start of the (minibatch * numFiltersInBlock)-long run of partial sums for this (filter, tap).
+  const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock +
+      filterWeightIdx * minibatch * numFiltersInBlock;
+  const float* tempInput = &input[inputOffset];
+  // Each thread sums every SB-th entry of the run, starting at its own tid.
+  int readIndex = tid;
+
+  float sum = float(0.0);
+  while (readIndex < (minibatch * numFiltersInBlock)) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+  // blockReduce comes from cuda_utils.cu; presumably a block-wide sum (TODO confirm).
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
new file mode 100644
index 0000000000..e7e597f474
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
@@ -0,0 +1,137 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import lightconv_cuda
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from torch import nn
+from torch.autograd import Function
+
+
+class lightconvFunction(Function):
+    @staticmethod
+    def forward(ctx, x, weights, padding_l):
+        ctx.padding_l = padding_l
+        outputs = lightconv_cuda.forward(x, weights, padding_l)
+        variables = [x, weights]
+        ctx.save_for_backward(*variables)
+        return outputs[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        outputs = lightconv_cuda.backward(
+            grad_output.contiguous(), ctx.padding_l, *ctx.saved_tensors
+        )
+        grad_input, grad_weights = outputs
+        return grad_input, grad_weights, None
+
+
+@with_incremental_state
+class LightconvLayer(nn.Module):  # lightweight convolution backed by the lightconv_cuda extension
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding_l=None,
+        weight_softmax=False,
+        num_heads=1,
+        weight_dropout=0.0,
+        bias=False,
+    ):
+        super(LightconvLayer, self).__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l  # NOTE(review): forwarded to an `int` C++ argument; None presumably unsupported there
+        self.num_heads = num_heads
+        self.weight_softmax = weight_softmax
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+        # One filter of kernel_size taps per head.
+        self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(input_size))  # NOTE(review): forward() never applies this bias
+        else:
+            self.bias = None
+        self.reset_parameters()
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        prefix = name + "." if name != "" else ""
+        for k, v in state_dict.items():
+            if k.endswith(prefix + "weight"):
+                if v.dim() == 3 and v.size(1) == 1:  # stored with a singleton middle dim
+                    state_dict[k] = v.squeeze(1)
+
+    def reset_parameters(self):
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.0)
+
+    def forward(self, x, incremental_state=None):
+        """Apply the convolution to x of size (T, B, C); the result has the same (T, B, C) layout."""
+        # during inference time, incremental BMM is faster
+        if incremental_state is not None:
+            T, B, C = x.size()
+            K, H = self.kernel_size, self.num_heads
+            R = C // H
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)  # append x along a new last dim
+            if self.kernel_size > 1:
+                self._set_input_buffer(
+                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
+                )
+            x_unfold = x_unfold.view(T * B * H, R, -1)
+
+            weight = self.weight
+            if self.weight_softmax:
+                weight = F.softmax(weight.float(), dim=1).type_as(weight)
+            # Keep only as many trailing taps as there are buffered steps.
+            weight = weight[:, -x_unfold.size(2) :]
+
+            K = weight.size(1)
+
+            weight = (
+                weight.view(1, H, K)
+                .expand(T * B, H, K)
+                .contiguous()
+                .view(T * B * H, K, 1)
+            )
+
+            weight = self.weight_dropout_module(weight)
+            output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
+            output = output.view(T, B, C)
+            return output
+
+        # during training time, use CUDA kernel
+        else:
+            x = x.permute(1, 2, 0).contiguous()  # (T, B, C) -> (B, C, T)
+            weight = self.weight
+            if self.weight_softmax:
+                weight = F.softmax(self.weight, -1)
+            if self.weight_dropout_module.p:
+                weight = self.weight_dropout_module(weight)
+            return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1)
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)  # reorder along dim 1 (B in forward)
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):
+        return utils.get_incremental_state(self, incremental_state, "input_buffer")
+
+    def _set_input_buffer(self, incremental_state, new_buffer):
+        return utils.set_incremental_state(
+            self, incremental_state, "input_buffer", new_buffer
+        )
+
+    def half(self):
+        return self._apply(lambda t: t.half() if t.is_floating_point() else t)  # non-float tensors are left as-is
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
new file mode 100644
index 0000000000..052635be79
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+
+# Compile the C++ binding and the CUDA kernel into a single extension module
+# named "lightconv_cuda".
+lightconv_cuda_extension = CUDAExtension(
+    name="lightconv_cuda",
+    sources=["lightconv_cuda.cpp", "lightconv_cuda_kernel.cu"],
+)
+
+setup(
+    name="lightconv_layer",
+    ext_modules=[lightconv_cuda_extension],
+    # use torch's BuildExtension as the build_ext command
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
new file mode 100644
index 0000000000..ec11a95079
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
@@ -0,0 +1,310 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.unfold import unfold1d
+
+
+def LightweightConv(
+    input_size,
+    kernel_size=1,
+    padding_l=None,
+    num_heads=1,
+    weight_dropout=0.0,
+    weight_softmax=False,
+    bias=False,
+):
+    """Build a lightweight-convolution module.
+
+    When CUDA is available the ``LightconvLayer`` from
+    ``fairseq.modules.lightconv_layer`` is tried first; if that raises
+    ``ImportError`` the error is printed and ``LightweightConv1dTBC`` is
+    returned instead. All arguments are forwarded unchanged to whichever
+    class is constructed.
+    """
+    conv_kwargs = dict(
+        kernel_size=kernel_size,
+        padding_l=padding_l,
+        num_heads=num_heads,
+        weight_dropout=weight_dropout,
+        weight_softmax=weight_softmax,
+        bias=bias,
+    )
+    if torch.cuda.is_available():
+        try:
+            from fairseq.modules.lightconv_layer import LightconvLayer
+
+            return LightconvLayer(input_size, **conv_kwargs)
+        except ImportError as e:
+            print(e)
+    return LightweightConv1dTBC(input_size, **conv_kwargs)
+
+
+class LightweightConv1d(nn.Module):
+    """Lightweight Convolution assuming the input is BxCxT.
+
+    Reference implementation that shows the LightConv computation more
+    plainly than the TBC version; the model itself does not use it.
+
+    Args:
+        input_size: # of channels of the input and output
+        kernel_size: width of the convolution kernel
+        padding: padding handed to ``F.conv1d``
+        num_heads: number of heads used. The weight is of shape
+            `(num_heads, 1, kernel_size)`
+        weight_softmax: normalize the weight with softmax before the convolution
+        bias: add a learnable per-channel bias to the output
+        weight_dropout: dropout rate applied to the weight before convolving
+
+    Shape:
+        Input: BxCxT, i.e. (batch_size, input_size, timesteps)
+        Output: BxCxT, i.e. (batch_size, input_size, timesteps)
+
+    Attributes:
+        weight: learnable tensor of shape `(num_heads, 1, kernel_size)`
+        bias: learnable tensor of shape `(input_size)`, or None
+    """
+
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding=0,
+        num_heads=1,
+        weight_softmax=False,
+        bias=False,
+        weight_dropout=0.0,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.num_heads = num_heads
+        self.padding = padding
+        self.weight_softmax = weight_softmax
+
+        self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))
+        # bias stays a plain ``None`` attribute when it is disabled
+        self.bias = nn.Parameter(torch.Tensor(input_size)) if bias else None
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        """Xavier-uniform init for the weight; zero the bias when present."""
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.0)
+
+    def forward(self, input):
+        """Apply the convolution.
+
+        input size: B x C x T
+        output size: B x C x T
+        """
+        batch, channels, steps = input.size()
+        heads = self.num_heads
+
+        kernel = F.softmax(self.weight, dim=-1) if self.weight_softmax else self.weight
+        kernel = self.weight_dropout_module(kernel)
+
+        # Fold every C/H channels into the batch dimension (C = self.input_size),
+        # B x C x T -> (B * C/H) x H x T, so one kernel per head is enough.
+        # Expanding the weight to C x 1 x K by a factor of C/H and leaving the
+        # input unreshaped would also work, but is slow.
+        grouped = input.view(-1, heads, steps)
+        result = F.conv1d(grouped, kernel, padding=self.padding, groups=heads)
+        result = result.view(batch, channels, steps)
+        if self.bias is None:
+            return result
+        return result + self.bias.view(1, -1, 1)
+
+
+@with_incremental_state
+class LightweightConv1dTBC(nn.Module):
+    """Lightweight Convolution assuming the input is TxBxC
+    Args:
+        input_size: # of channels of the input
+        kernel_size: width of the convolution kernel
+        padding_l: padding to the left when using "same" padding
+        num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size)
+        weight_dropout: the drop rate of the DropConnect to drop the weight
+        weight_softmax: normalize the weight with softmax before the convolution
+        bias: use bias
+
+    Shape:
+        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
+        Output: TxBxC, i.e. (timesteps, batch_size, input_size)
+
+    Attributes:
+        weight: the learnable weights of the module of shape
+            `(num_heads, 1, kernel_size)`
+        bias:   the learnable bias of the module of shape `(input_size)`
+    """
+
+    def __init__(
+        self,
+        input_size,
+        kernel_size=1,
+        padding_l=None,
+        num_heads=1,
+        weight_dropout=0.0,
+        weight_softmax=False,
+        bias=False,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.kernel_size = kernel_size
+        self.padding_l = padding_l
+        self.num_heads = num_heads
+        self.weight_dropout_module = FairseqDropout(
+            weight_dropout, module_name=self.__class__.__name__
+        )
+        self.weight_softmax = weight_softmax
+
+        self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))  # one kernel per head
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(input_size))
+        else:
+            self.bias = None
+
+        self.reset_parameters()
+        self.onnx_trace = False  # set to True by prepare_for_onnx_export_()
+
+    def reset_parameters(self):  # Xavier-uniform weight, zero bias (when a bias exists)
+        nn.init.xavier_uniform_(self.weight)
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.0)
+
+    def forward(self, x, incremental_state=None, unfold=False):
+        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
+        args:
+            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
+            incremental_state: A dict to keep the state
+            unfold: unfold the input or not. If not, we use the matrix trick instead
+        """
+        unfold = unfold or (incremental_state is not None)  # incremental decoding always unfolds
+
+        if unfold:
+            output = self._forward_unfolded(x, incremental_state)
+        else:
+            output = self._forward_expanded(x, incremental_state)
+
+        if self.bias is not None:
+            output = output + self.bias.view(1, 1, -1)  # broadcast over T and B
+        return output
+
+    def prepare_for_onnx_export_(self):  # makes the softmax calls below receive onnx_trace=True
+        self.onnx_trace = True
+
+    def _forward_unfolded(self, x, incremental_state):
+        """The conventional implementation of convolutions.
+        Unfolding the input by having a window shifting to the right."""
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H  # channels per head
+        assert R * H == C == self.input_size
+
+        weight = self.weight.view(H, K)
+        if incremental_state is not None:
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = x.new()  # empty tensor: nothing buffered yet
+            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
+            if self.kernel_size > 1:
+                # remember only the last kernel_size - 1 steps for the next call
+                self._set_input_buffer(
+                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
+                )
+            x_unfold = x_unfold.view(T * B * H, R, -1)
+        else:
+            # unfold the input: T x B x C --> T' x B x C x K
+            x_unfold = unfold1d(x, self.kernel_size, self.padding_l, 0)
+            x_unfold = x_unfold.view(T * B * H, R, K)
+
+        if self.weight_softmax:
+            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(
+                weight
+            )
+
+        if incremental_state is not None:
+            weight = weight[:, -x_unfold.size(2) :]  # keep as many trailing taps as buffered steps
+            K = weight.size(1)
+
+        weight = (
+            weight.view(1, H, K).expand(T * B, H, K).contiguous().view(T * B * H, K, 1)
+        )
+
+        weight = self.weight_dropout_module(weight)
+        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
+        output = output.view(T, B, C)
+        return output
+
+    def _forward_expanded(self, x, incremental_state):
+        """Turn the convolution filters into band matrices and do matrix multiplication.
+        This is faster when the sequence is short, but less memory efficient.
+        This is not used in the decoder during inference.
+        """
+        T, B, C = x.size()
+        K, H = self.kernel_size, self.num_heads
+        R = C // H  # channels per head
+        assert R * H == C == self.input_size
+
+        weight = self.weight.view(H, K)
+        if self.weight_softmax:
+            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(
+                weight
+            )
+        weight = weight.view(1, H, K).expand(T * B, H, K).contiguous()
+        weight = weight.view(T, B * H, K).transpose(0, 1)
+
+        x = x.view(T, B * H, R).transpose(0, 1)
+        P = self.padding_l
+        if K > T and P == K - 1:  # kernel longer than the sequence: keep only the last T taps
+            weight = weight.narrow(2, K - T, T)
+            K, P = T, T - 1
+        # turn the convolution filters into band matrices
+        weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
+        # a row stride of T + K (one more than the row width) starts row t's kernel at column t
+        weight_expanded.as_strided((B * H, T, K), (T * (T + K - 1), T + K, 1)).copy_(
+            weight
+        )
+        weight_expanded = weight_expanded.narrow(2, P, T)  # skip the first P columns
+        weight_expanded = self.weight_dropout_module(weight_expanded)
+
+        output = torch.bmm(weight_expanded, x)
+        output = output.transpose(0, 1).contiguous().view(T, B, C)
+        return output
+
+    def reorder_incremental_state(self, incremental_state, new_order):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(1, new_order)  # dim 1 is B in the T x B x C x K buffer
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    def _get_input_buffer(self, incremental_state):  # look up this module's "input_buffer" entry
+        return utils.get_incremental_state(self, incremental_state, "input_buffer")
+
+    def _set_input_buffer(self, incremental_state, new_buffer):  # store under the "input_buffer" key
+        return utils.set_incremental_state(
+            self, incremental_state, "input_buffer", new_buffer
+        )
+
+    def extra_repr(self):  # summary string; weight_dropout is appended only when it is > 0
+        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, bias={}".format(
+            self.input_size,
+            self.kernel_size,
+            self.padding_l,
+            self.num_heads,
+            self.weight_softmax,
+            self.bias is not None,
+        )
+        if self.weight_dropout_module.p > 0.0:
+            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
+        return s
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
new file mode 100644
index 0000000000..1c7a9f09ac
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+
+from .conv_tbc import ConvTBC
+
+from typing import Dict, Optional
+from torch import Tensor
+
+
+@with_incremental_state
+class LinearizedConvolution(ConvTBC):
+    """An optimized version of nn.Conv1d.
+
+    At training time, this module uses ConvTBC, which is an optimized version
+    of Conv1d. At inference time, it optimizes incremental generation (i.e.,
+    one time step at a time) by replacing the convolutions with linear layers.
+    Note that the input order changes from training to inference.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
+        super().__init__(in_channels, out_channels, kernel_size, **kwargs)
+        self._linearized_weight = None
+        self.register_backward_hook(self._clear_linearized_weight)  # hook resets the cached weight
+
+    def state_dict(self, destination=None, prefix="", keep_vars=False):
+        state = ConvTBC.state_dict(self, destination, prefix, keep_vars=keep_vars)
+        # don't store redundant _linearized_weight in checkpoints
+        if prefix + "_linearized_weight" in state:
+            del state[prefix + "_linearized_weight"]
+        return state
+
+    def upgrade_state_dict_named(self, state_dict, name):  # drop a stored _linearized_weight entry
+        prefix = name + "." if name != "" else ""
+        if prefix + "_linearized_weight" in state_dict:
+            del state_dict[prefix + "_linearized_weight"]
+
+    @torch.jit.export
+    def forward(
+        self,
+        input,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """
+        Args:
+            incremental_state: Used to buffer signal; if not None, then input is
+                expected to contain a single frame. If the input order changes
+                between time steps, call reorder_incremental_state.
+        Input:
+            Time x Batch x Channel during training
+            Batch x Time x Channel during inference
+        """
+        if incremental_state is None:
+            output = self.conv_tbc(input)
+            if self.kernel_size[0] > 1 and self.padding[0] > 0:
+                # remove future timesteps added by padding
+                output = output[: -self.padding[0], :, :]
+            return output
+
+        # reshape weight
+        weight = self._get_linearized_weight()
+        kw = self.kernel_size[0]
+
+        bsz = input.size(0)  # input: bsz x len x dim
+        if kw > 1:
+            input = input.data
+            input_buffer = self._get_input_buffer(incremental_state)
+            if input_buffer is None:
+                input_buffer = input.new(bsz, kw, input.size(2)).zero_()  # bsz x kw x dim, zero-filled
+                self._set_input_buffer(incremental_state, input_buffer)
+            else:
+                # shift buffer
+                input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone()
+            # append next input
+            input_buffer[:, -1, :] = input[:, -1, :]
+            input = input_buffer
+        with torch.no_grad():
+            output = F.linear(input.view(bsz, -1), weight, self.bias)
+        return output.view(bsz, 1, -1)
+
+    @torch.jit.unused
+    def reorder_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order,
+    ):
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            input_buffer = input_buffer.index_select(0, new_order)  # dim 0 is bsz in the bsz x kw x dim buffer
+            self._set_input_buffer(incremental_state, input_buffer)
+
+    @torch.jit.unused
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ):
+        return utils.get_incremental_state(self, incremental_state, "input_buffer")
+
+    @torch.jit.unused
+    def _set_input_buffer(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_buffer,
+    ):
+        return utils.set_incremental_state(
+            self, incremental_state, "input_buffer", new_buffer
+        )
+
+    @torch.jit.unused
+    def _get_linearized_weight(self):
+        if self._linearized_weight is None:
+            kw = self.kernel_size[0]
+            weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()
+            assert weight.size() == (self.out_channels, kw, self.in_channels)
+            # NOTE(review): the reshaped weight is returned without being stored in
+            # self._linearized_weight, so it is rebuilt on every call - confirm intended
+            return weight.view(self.out_channels, -1)
+        return self._linearized_weight
+
+    @torch.jit.unused
+    def _clear_linearized_weight(self, *args):  # registered as the backward hook in __init__
+        self._linearized_weight = None
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
new file mode 100644
index 0000000000..dbbbfb9f2d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+import torch
+import torch.nn.functional as F
+
+
+class LocationAttention(nn.Module):
+    """
+    Attention-Based Models for Speech Recognition
+    https://arxiv.org/pdf/1506.07503.pdf
+
+    :param int attn_dim: attention dimension
+    :param int encoder_dim: # projection-units of encoder
+    :param int decoder_dim: # units of decoder
+    :param int attn_state_kernel_size: # input channels of attention convolution
+    :param int conv_dim: # output channels of attention convolution
+    :param int conv_kernel_size: the filter spans 2 * conv_kernel_size + 1 steps
+    :param float scaling: multiplier applied to the scores before the softmax
+    """
+
+    def __init__(
+        self,
+        attn_dim,
+        encoder_dim,
+        decoder_dim,
+        attn_state_kernel_size,
+        conv_dim,
+        conv_kernel_size,
+        scaling=2.0,
+    ):
+        super().__init__()
+        self.attn_dim = attn_dim
+        self.decoder_dim = decoder_dim
+        self.scaling = scaling
+        self.proj_enc = nn.Linear(encoder_dim, attn_dim)
+        self.proj_dec = nn.Linear(decoder_dim, attn_dim, bias=False)
+        self.proj_attn = nn.Linear(conv_dim, attn_dim, bias=False)
+        self.conv = nn.Conv1d(
+            in_channels=attn_state_kernel_size,
+            out_channels=conv_dim,
+            kernel_size=2 * conv_kernel_size + 1,
+            padding=conv_kernel_size,
+            bias=False,
+        )
+        self.proj_out = nn.Sequential(nn.Tanh(), nn.Linear(attn_dim, 1))
+        self.proj_enc_out = None  # cache
+
+    def clear_cache(self):
+        """Drop the cached encoder projection so forward() recomputes it."""
+        self.proj_enc_out = None
+
+    def forward(self, encoder_out, encoder_padding_mask, decoder_h, attn_state):
+        """
+        :param torch.Tensor encoder_out: padded encoder hidden state B x T x D
+        :param torch.Tensor encoder_padding_mask: encoder padding mask
+        :param torch.Tensor decoder_h: decoder hidden state B x D (zeros if None)
+        :param torch.Tensor attn_state: attention state B x K x T
+        :return: attention weighted encoder state (B, D)
+        :rtype: torch.Tensor
+        :return: attention weights (B x T)
+        :rtype: torch.Tensor
+        """
+        batch, steps, _ = encoder_out.size()
+        if self.proj_enc_out is None:
+            self.proj_enc_out = self.proj_enc(encoder_out)
+
+        # B x K x T -> B x C x T -> B x T x C -> B x T x D
+        location = self.proj_attn(self.conv(attn_state).transpose(1, 2))
+
+        if decoder_h is None:
+            decoder_h = encoder_out.new_zeros(batch, self.decoder_dim)
+        query = self.proj_dec(decoder_h).view(batch, 1, self.attn_dim)
+
+        scores = self.proj_out(location + self.proj_enc_out + query).squeeze(2)
+        scores.masked_fill_(encoder_padding_mask, -float("inf"))
+
+        weights = F.softmax(self.scaling * scores, dim=1)
+        context = torch.sum(encoder_out * weights.view(batch, steps, 1), dim=1)
+        return context, weights
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
new file mode 100644
index 0000000000..273308951f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+
+class LSTMCellWithZoneOut(nn.Module):
+    """
+    LSTM cell with zoneout applied to its state update.
+    Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations
+    https://arxiv.org/abs/1606.01305
+    """
+
+    def __init__(
+        self, prob: float, input_size: int, hidden_size: int, bias: bool = True
+    ):
+        super().__init__()
+        self.lstm_cell = nn.LSTMCell(input_size, hidden_size, bias=bias)
+        self.prob = prob
+        if prob > 1.0 or prob < 0.0:
+            raise ValueError(
+                "zoneout probability must be in the range from 0.0 to 1.0."
+            )
+
+    def zoneout(self, h, next_h, prob):
+        """Mix ``h`` into ``next_h``: Bernoulli mask when training, else fixed blend."""
+        if isinstance(h, tuple):
+            return tuple(self.zoneout(old, next_h[i], prob) for i, old in enumerate(h))
+        if not self.training:
+            return prob * h + (1 - prob) * next_h
+        keep = h.new_zeros(*h.size()).bernoulli_(prob)
+        return keep * h + (1 - keep) * next_h
+
+    def forward(self, x, h):
+        return self.zoneout(h, self.lstm_cell(x, h), self.prob)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
new file mode 100644
index 0000000000..262132dfe7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
@@ -0,0 +1,910 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import Parameter
+
+try:
+    from xformers.components.attention import build_attention
+    from xformers.components.attention.utils import maybe_merge_masks
+
+    _xformers_available = True
+except ImportError:
+    _xformers_available = False
+
+from fairseq import utils
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+from fairseq.models.fairseq_incremental_decoder import FairseqIncrementalDecoder
+
+
+# TODO: move this into xformers?
+# TODO: uint8 input type should just output a bool
+def _mask_for_xformers(mask: Tensor, to_dtype: Optional[torch.dtype] = None):
+    """
+    call to pytorch multihead accepts three mask types:
+        - ByteTensor where non-zero means to mask
+        - FloatTensor which is an additive mask
+        - BoolTensor where True means to mask
+    xFormers currently accepts boolean and additive masks. For boolean masks
+    the values have opposite meaning. For a BoolTensor True means to keep the value.
+
+    Any floating-point dtype (bfloat16 and float64 included) is additive.
+    When ``to_dtype`` is None the returned mask keeps the dtype of ``mask``.
+    """
+    # A floating-point mask is additive. Otherwise it is either uint8 or bool.
+    additive = mask.dtype.is_floating_point
+    # If to_dtype is not specified, keep same dtype as mask.
+    to_dtype = mask.dtype if to_dtype is None else to_dtype
+    to_additive = to_dtype.is_floating_point
+
+    if additive:
+        if to_additive:
+            return mask.to(to_dtype)
+        mask = mask < 0
+
+    if to_additive:
+        # return additive mask; masked_fill_ needs a bool mask, so convert uint8
+        new_mask = torch.zeros_like(mask, dtype=to_dtype)
+        return new_mask.masked_fill_(mask.to(torch.bool), -float("inf"))
+
+    # In xFormers True is value to keep rather than value to mask
+    mask = ~mask.to(torch.bool)
+    return mask.to(to_dtype)
+
+
+class MultiheadAttention(FairseqIncrementalDecoder):
+    """Multi-headed attention.
+
+    See "Attention Is All You Need" for more details.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        self_attention=False,
+        encoder_decoder_attention=False,
+        dictionary=None,
+        q_noise=0.0,
+        qn_block_size=8,
+        # TODO: pass in config rather than string.
+        # config defined in xformers.components.attention.AttentionConfig
+        xformers_att_config: Optional[str] = None,
+        xformers_blocksparse_layout: Optional[
+            torch.Tensor
+        ] = None,  # This should be part of the config
+        xformers_blocksparse_blocksize: Optional[
+            int
+        ] = 16,  # This should be part of the config
+    ):
+        super().__init__(dictionary)
+
+        xformers_att_config = utils.eval_str_dict(xformers_att_config)
+        self.use_xformers = xformers_att_config is not None  # xFormers path is opt-in via the config
+        if self.use_xformers and not _xformers_available:
+            raise ImportError("\n\n  Please install xFormers.")
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim  # key dim defaults to embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim  # value dim defaults to embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+
+        assert not self.self_attention or self.qkv_same_dim, (
+            "Self-attention requires query, key and " "value to be of the same size"
+        )
+
+        # k/v/q/out projections, each wrapped with quant_noise(q_noise, qn_block_size)
+        self.k_proj = quant_noise(
+            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+        self.v_proj = quant_noise(
+            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+        self.q_proj = quant_noise(
+            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+
+        self.out_proj = quant_noise(
+            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
+        )
+
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+        else:
+            self.bias_k = self.bias_v = None
+
+        self.add_zero_attn = add_zero_attn
+        self.beam_size = 1
+        self.reset_parameters()  # bias_k / bias_v above are uninitialized until this runs
+
+        if self.use_xformers:
+            # values already present in the config win over the constructor arguments
+            xformers_att_config["dropout"] = xformers_att_config.get("dropout", dropout)
+            xformers_att_config["num_heads"] = xformers_att_config.get(
+                "num_heads", num_heads
+            )
+
+            if xformers_blocksparse_layout is not None:
+                # Could be part of a single config passed only once
+                xformers_att_config["block_size"] = xformers_blocksparse_blocksize
+                xformers_att_config["layout"] = xformers_blocksparse_layout
+                xformers_att_config["name"] = "blocksparse"
+
+            self.attention = build_attention(xformers_att_config)
+
+        self.onnx_trace = False  # set to True by prepare_for_onnx_export_()
+        self.skip_embed_dim_check = False
+        self.init_incremental_state()
+
+    def prepare_for_onnx_export_(self):  # flag this module as being traced for ONNX export
+        self.onnx_trace = True
+
+    def reset_parameters(self):
+        """Initialize the projections, the output bias and the optional kv biases.
+
+        The q/k/v weights use a Xavier-uniform gain of 1/sqrt(2) when query, key
+        and value share one dimension, and the default gain of 1 otherwise.
+        """
+        # Empirically observed the convergence to be much better with
+        # the scaled initialization
+        gain = 1 / math.sqrt(2) if self.qkv_same_dim else 1.0
+        for proj in (self.k_proj, self.v_proj, self.q_proj):
+            nn.init.xavier_uniform_(proj.weight, gain=gain)
+
+        nn.init.xavier_uniform_(self.out_proj.weight)
+        if self.out_proj.bias is not None:
+            nn.init.constant_(self.out_proj.bias, 0.0)
+        # bias_k / bias_v may be None, in which case they are skipped
+        for kv_bias in (self.bias_k, self.bias_v):
+            if kv_bias is not None:
+                nn.init.xavier_normal_(kv_bias)
+
+    def _get_reserve_head_index(self, num_heads_to_keep: int):
+        """Select the attention heads with the largest L1 norm.
+
+        Head ``i`` owns rows ``[i * head_dim, (i + 1) * head_dim)`` of the k, q
+        and v projections. Its score is the sum of absolute values of those
+        rows in the three projection weights plus the matching bias entries, so
+        all three projections must have been built with a bias.
+
+        Args:
+            num_heads_to_keep: how many of the highest-scoring heads to return.
+
+        Returns:
+            A list of ``(start, end)`` row ranges, one per kept head, ordered
+            from the highest to the lowest score.
+
+        Raises:
+            IndexError: if ``num_heads_to_keep`` exceeds ``self.num_heads``.
+        """
+
+        def l1_norm(param, start_idx, end_idx):
+            # sum of absolute values over rows [start_idx, end_idx) as a float
+            return torch.sum(torch.abs(param[start_idx:end_idx])).tolist()
+
+        def head_norm(proj, start_idx, end_idx):
+            # one projection's contribution to a head: weight rows + bias entries
+            weight_part = l1_norm(proj.weight, start_idx, end_idx)
+            bias_part = l1_norm(proj.bias, start_idx, end_idx)
+            return weight_part + bias_part
+
+        heads_norm = []
+        for head in range(self.num_heads):
+            start_idx = head * self.head_dim
+            end_idx = start_idx + self.head_dim
+            # summed in the order k + q + v
+            heads_norm.append(
+                head_norm(self.k_proj, start_idx, end_idx)
+                + head_norm(self.q_proj, start_idx, end_idx)
+                + head_norm(self.v_proj, start_idx, end_idx)
+            )
+
+        # rank heads from largest to smallest norm; sorted() is stable, so
+        # heads with equal norms keep their index order
+        ranked_heads = sorted(
+            range(self.num_heads), key=lambda k: heads_norm[k], reverse=True
+        )
+
+        # translate the best-ranked heads back into projection row ranges
+        reserve_head_index = []
+        for rank in range(num_heads_to_keep):
+            start = ranked_heads[rank] * self.head_dim
+            end = start + self.head_dim
+            reserve_head_index.append((start, end))
+
+        return reserve_head_index
+
+    def _adaptive_prune_heads(self, reserve_head_index: List[Tuple[int, int]]):
+        new_q_weight = []
+        new_q_bias = []
+        new_k_weight = []
+        new_k_bias = []
+        new_v_weight = []
+        new_v_bias = []
+        new_out_proj_weight = []
+        # For every kept (start, end) range take rows of q/k/v and columns of out_proj.
+        for ele in reserve_head_index:
+            start_idx, end_idx = ele
+            new_q_weight.append(
+                self.q_proj.weight[
+                    start_idx:end_idx,
+                ]
+            )
+            new_q_bias.append(self.q_proj.bias[start_idx:end_idx])
+
+            new_k_weight.append(
+                self.k_proj.weight[
+                    start_idx:end_idx,
+                ]
+            )
+
+            new_k_bias.append(self.k_proj.bias[start_idx:end_idx])
+
+            new_v_weight.append(
+                self.v_proj.weight[
+                    start_idx:end_idx,
+                ]
+            )
+            new_v_bias.append(self.v_proj.bias[start_idx:end_idx])
+
+            new_out_proj_weight.append(self.out_proj.weight[:, start_idx:end_idx])
+        # Join the slices into new tensors detached from the old parameters' graph.
+        new_q_weight = torch.cat(new_q_weight).detach()
+        new_k_weight = torch.cat(new_k_weight).detach()
+        new_v_weight = torch.cat(new_v_weight).detach()
+        new_out_proj_weight = torch.cat(new_out_proj_weight, dim=-1).detach()
+        new_q_weight.requires_grad = True
+        new_k_weight.requires_grad = True
+        new_v_weight.requires_grad = True
+        new_out_proj_weight.requires_grad = True
+
+        new_q_bias = torch.cat(new_q_bias).detach()
+        new_q_bias.requires_grad = True
+
+        new_k_bias = torch.cat(new_k_bias).detach()
+        new_k_bias.requires_grad = True
+
+        new_v_bias = torch.cat(new_v_bias).detach()
+        new_v_bias.requires_grad = True
+        # Install the pruned tensors as the projections' parameters.
+        self.q_proj.weight = torch.nn.Parameter(new_q_weight)
+        self.q_proj.bias = torch.nn.Parameter(new_q_bias)
+
+        self.k_proj.weight = torch.nn.Parameter(new_k_weight)
+        self.k_proj.bias = torch.nn.Parameter(new_k_bias)
+
+        self.v_proj.weight = torch.nn.Parameter(new_v_weight)
+        self.v_proj.bias = torch.nn.Parameter(new_v_bias)
+
+        self.out_proj.weight = torch.nn.Parameter(new_out_proj_weight)
+        # Sync bookkeeping with the kept heads (out_proj.in_features is not updated here).
+        self.num_heads = len(reserve_head_index)
+        self.embed_dim = self.head_dim * self.num_heads
+        self.q_proj.out_features = self.embed_dim
+        self.k_proj.out_features = self.embed_dim
+        self.v_proj.out_features = self.embed_dim
+
+    def _set_skip_embed_dim_check(self):
+        self.skip_embed_dim_check = True  # forward() then skips its embed_dim assert and the F.multi_head_attention_forward path
+
+    def _pad_masks(
+        self,
+        key_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor],
+    ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
+        if attn_mask is not None:  # widen the last dim by one zero-filled column
+            shape = attn_mask.size()[:-1] + torch.Size([1])
+            attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(shape)], dim=-1)
+        if key_padding_mask is not None:  # same one-column zero padding
+            shape = key_padding_mask.size()[:-1] + torch.Size([1])
+            key_padding_mask = torch.cat(
+                [
+                    key_padding_mask,
+                    key_padding_mask.new_zeros(shape),
+                ],
+                dim=-1,
+            )
+        return key_padding_mask, attn_mask  # order: padding mask first, then attn mask
+
+    def _add_bias(
+        self,
+        k: Tensor,
+        v: Tensor,
+        key_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor],
+        bsz: int,
+    ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        assert self.bias_k is not None
+        assert self.bias_v is not None
+        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])  # bias_k tiled bsz times on dim 1, appended along dim 0
+        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])  # same for bias_v
+        key_padding_mask, attn_mask = self._pad_masks(  # masks gain one zero column for the extra position
+            key_padding_mask=key_padding_mask, attn_mask=attn_mask
+        )
+        return k, v, key_padding_mask, attn_mask
+
+    def _append_zero_attn(
+        self,
+        k: Tensor,
+        v: Tensor,
+        key_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor],
+    ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        zero_attn_shape = k.size()[:-2] + torch.Size([1]) + k.size()[-1:]  # k's shape with dim -2 set to 1
+        k = torch.cat(  # append one all-zero entry along dim -2
+            [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=-2
+        )
+        v = torch.cat(  # v is extended with the same (k-derived) shape
+            [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=-2
+        )
+        key_padding_mask, attn_mask = self._pad_masks(  # masks gain one zero column to match
+            key_padding_mask=key_padding_mask, attn_mask=attn_mask
+        )
+        return k, v, key_padding_mask, attn_mask
+
+    def _xformers_attn_forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        need_weights: bool = True,
+        attn_mask: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Compute attention through ``self.attention``; returns ``(output, None)`` (no weights)."""
+        tgt_len, bsz, embed_dim = query.size()
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == tgt_len  # NOTE(review): compares with tgt_len, not the key length - confirm for cross-attention
+
+        if self.self_attention:
+            key = query
+            value = query
+        elif self.encoder_decoder_attention:
+            value = key
+
+        q = self.q_proj(query)
+        k = self.k_proj(key)
+        v = self.v_proj(value)
+
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k, v, key_padding_mask, attn_mask = self._add_bias(
+                k, v, key_padding_mask, attn_mask, bsz
+            )
+
+        def fold_heads(x):
+            return (
+                x.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        def split_heads(x):
+            return (
+                x.contiguous()
+                .view(-1, bsz, self.num_heads, self.head_dim)
+                .transpose(0, 1)
+                .transpose(1, 2)
+            )
+
+        massage = split_heads if self.attention.requires_head_dimension else fold_heads
+        q = massage(q)
+        if k is not None:
+            k = massage(k)
+        if v is not None:
+            v = massage(v)
+
+        if self.add_zero_attn:
+            k, v, key_padding_mask, attn_mask = self._append_zero_attn(
+                k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask
+            )
+
+        kwargs = {}
+
+        if attn_mask is not None and self.attention.supports_attention_mask:
+            attn_mask = _mask_for_xformers(attn_mask, to_dtype=q.dtype)
+            kwargs["att_mask"] = attn_mask
+
+        if key_padding_mask is not None:
+            key_padding_mask = _mask_for_xformers(key_padding_mask, to_dtype=torch.bool)
+            if not self.attention.requires_separate_masks:
+                attn_mask = maybe_merge_masks(
+                    attn_mask,
+                    key_padding_mask,
+                    batch_size=bsz,
+                    src_len=k.size(-2),
+                    tgt_len=q.size(-2),
+                    num_heads=self.num_heads,
+                )
+                key_padding_mask = None
+                kwargs["att_mask"] = attn_mask
+            if self.attention.supports_key_padding_mask:
+                kwargs["key_padding_mask"] = key_padding_mask
+
+        y = self.attention(q, k, v, **kwargs)
+
+        y = (
+            y.view(bsz, self.num_heads, tgt_len, self.head_dim)
+            .transpose(1, 2)
+            .flatten(start_dim=2, end_dim=3)
+            .transpose(0, 1)
+        )
+        assert list(y.size()) == [tgt_len, bsz, embed_dim]
+
+        # Dropout not needed because already applied in attention.
+        # It is applied to the attention weights before matmul with v.
+        y = self.out_proj(y)
+
+        # TODO: support returning attention weights if needed.
+        return y, None
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        need_weights: bool = True,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        before_softmax: bool = False,
+        need_head_weights: bool = False,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel
+
+        Args:
+            key_padding_mask (ByteTensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where
+                padding elements are indicated by 1s.
+            need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: True).
+            attn_mask (ByteTensor, optional): typically used to
+                implement causal attention, where the mask prevents the
+                attention from looking forward in time (default: None).
+            before_softmax (bool, optional): return the raw attention
+                weights and values before the attention softmax.
+            need_head_weights (bool, optional): return the attention
+                weights for each head. Implies *need_weights*. Default:
+                return the average attention weights over all heads.
+        """
+        if need_head_weights:
+            need_weights = True
+
+        is_tpu = query.device.type == "xla"
+
+        tgt_len, bsz, embed_dim = query.size()
+        src_len = tgt_len
+        if not self.skip_embed_dim_check:
+            assert (
+                embed_dim == self.embed_dim
+            ), f"query dim {embed_dim} != {self.embed_dim}"
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+        if key is not None:
+            src_len, key_bsz, _ = key.size()
+            if not torch.jit.is_scripting():
+                assert value is not None
+                assert (src_len, key_bsz) == value.shape[:2]  # tuple compare; the unparenthesized form never checked the shape
+
+        if (
+            not self.onnx_trace
+            and not is_tpu  # don't use PyTorch version on TPUs
+            and incremental_state is None
+            and not static_kv
+            # A workaround for quantization to work. Otherwise JIT compilation
+            # treats bias in linear module as method.
+            and not torch.jit.is_scripting()
+            # The Multihead attention implemented in pytorch forces strong dimension check
+            # for input embedding dimension and K,Q,V projection dimension.
+            # Since pruning will break the dimension check and it is not easy to modify the pytorch API,
+            # it is preferred to bypass the pytorch MHA when we need to skip embed_dim_check
+            and not self.skip_embed_dim_check
+        ):
+            assert key is not None and value is not None
+
+            if self.use_xformers:
+                return self._xformers_attn_forward(
+                    query, key, value, key_padding_mask, need_weights, attn_mask
+                )
+
+            else:
+                return F.multi_head_attention_forward(
+                    query,
+                    key,
+                    value,
+                    self.embed_dim,
+                    self.num_heads,
+                    torch.empty([0]),
+                    torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
+                    self.bias_k,
+                    self.bias_v,
+                    self.add_zero_attn,
+                    self.dropout_module.p,
+                    self.out_proj.weight,
+                    self.out_proj.bias,
+                    self.training or self.dropout_module.apply_during_inference,
+                    key_padding_mask.bool() if key_padding_mask is not None else None,
+                    need_weights,
+                    attn_mask,
+                    use_separate_proj_weight=True,
+                    q_proj_weight=self.q_proj.weight,
+                    k_proj_weight=self.k_proj.weight,
+                    v_proj_weight=self.v_proj.weight,
+                )
+
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+
+        if self.self_attention:
+            q = self.q_proj(query)
+            k = self.k_proj(query)
+            v = self.v_proj(query)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                assert value is None
+                k = v = None
+            else:
+                if self.beam_size > 1 and bsz == key.size(1):
+                    # key is [T, bsz*beam_size, C], reduce to [T, bsz, C]
+                    key = key.view(key.size(0), -1, self.beam_size, key.size(2))[
+                        :, :, 0, :
+                    ]
+                    if key_padding_mask is not None:
+                        key_padding_mask = key_padding_mask.view(
+                            -1, self.beam_size, key_padding_mask.size(1)
+                        )[:, 0, :]
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k, v, key_padding_mask, attn_mask = self._add_bias(
+                k, v, key_padding_mask, attn_mask, bsz
+            )
+
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads, self.head_dim)
+            .transpose(0, 1)
+        )
+        kv_bsz = bsz  # need default value for scripting
+        if k is not None:
+            kv_bsz = k.size(1)
+            k = (
+                k.contiguous()
+                .view(-1, kv_bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, kv_bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                kv_bsz = _prev_key.size(0)
+                prev_key = _prev_key.view(kv_bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+                src_len = k.size(1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                assert kv_bsz == _prev_value.size(0)
+                prev_value = _prev_value.view(
+                    kv_bsz * self.num_heads, -1, self.head_dim
+                )
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=kv_bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+
+            saved_state["prev_key"] = k.view(kv_bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(
+                kv_bsz, self.num_heads, -1, self.head_dim
+            )
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        assert k.size(1) == src_len
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == kv_bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.add_zero_attn:
+            assert v is not None
+            src_len += 1
+            k, v, key_padding_mask, attn_mask = self._append_zero_attn(
+                k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask
+            )
+
+        if self.encoder_decoder_attention and bsz != kv_bsz:
+            attn_weights = torch.einsum(
+                "bxhtd,bhsd->bxhts",
+                q.view((kv_bsz, -1, self.num_heads) + q.size()[1:]),
+                k.view((kv_bsz, self.num_heads) + k.size()[1:]),
+            )
+            attn_weights = attn_weights.reshape((-1,) + attn_weights.size()[-2:])
+        else:
+            attn_weights = torch.bmm(q, k.transpose(1, 2))
+        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            if self.onnx_trace:
+                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            if not is_tpu:
+                attn_weights = attn_weights.view(
+                    kv_bsz, -1, self.num_heads, tgt_len, src_len
+                )
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1)
+                    .unsqueeze(2)
+                    .unsqueeze(3)
+                    .to(torch.bool),
+                    float("-inf"),
+                )
+            else:
+                attn_weights = attn_weights.transpose(0, 2)
+                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
+                attn_weights = attn_weights.transpose(0, 2)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if before_softmax:
+            return attn_weights, v
+
+        attn_weights_float = utils.softmax(
+            attn_weights, dim=-1, onnx_trace=self.onnx_trace
+        )
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_probs = self.dropout_module(attn_weights)
+
+        assert v is not None
+        attn: Optional[Tensor] = None
+        if self.encoder_decoder_attention and bsz != kv_bsz:
+            attn = torch.einsum(
+                "bxhts,bhsd->bxhtd",
+                attn_probs.view(
+                    (
+                        kv_bsz,
+                        -1,
+                        self.num_heads,
+                    )
+                    + attn_probs.size()[1:]
+                ),
+                v.view(
+                    (
+                        kv_bsz,
+                        self.num_heads,
+                    )
+                    + v.size()[1:]
+                ),
+            )
+            attn = attn.reshape((-1,) + attn.size()[-2:])
+        else:
+            attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+        if self.onnx_trace and attn.size(1) == 1:
+            # when ONNX tracing a single decoder step (sequence length == 1)
+            # the transpose is a no-op copy before view, thus unnecessary
+            attn = attn.contiguous().view(tgt_len, bsz, self.embed_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)
+        attn = self.out_proj(attn)
+        attn_weights: Optional[Tensor] = None
+        if need_weights:
+            attn_weights = attn_weights_float.view(
+                bsz, self.num_heads, tgt_len, src_len
+            ).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+
+        return attn, attn_weights
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        # saved key padding masks have shape (bsz, seq_len)
+        if prev_key_padding_mask is not None and static_kv:
+            new_key_padding_mask = prev_key_padding_mask  # static keys: reuse the cached mask unchanged
+        elif prev_key_padding_mask is not None and key_padding_mask is not None:
+            new_key_padding_mask = torch.cat(  # cached mask first, current mask after it (both as float)
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and
+        # leaves the frame, there will be a time when prev or current
+        # is None
+        elif prev_key_padding_mask is not None:
+            if src_len > prev_key_padding_mask.size(1):  # zero-fill on the right up to src_len
+                filler = torch.zeros(
+                    (batch_size, src_len - prev_key_padding_mask.size(1)),
+                    device=prev_key_padding_mask.device,
+                )
+                new_key_padding_mask = torch.cat(
+                    [prev_key_padding_mask.float(), filler.float()], dim=1
+                )
+            else:
+                new_key_padding_mask = prev_key_padding_mask.float()
+        elif key_padding_mask is not None:
+            if src_len > key_padding_mask.size(1):  # zero-fill on the left up to src_len
+                filler = torch.zeros(
+                    (batch_size, src_len - key_padding_mask.size(1)),
+                    device=key_padding_mask.device,
+                )
+                new_key_padding_mask = torch.cat(
+                    [filler.float(), key_padding_mask.float()], dim=1
+                )
+            else:
+                new_key_padding_mask = key_padding_mask.float()
+        else:
+            new_key_padding_mask = prev_key_padding_mask  # both masks are None on this path
+        return new_key_padding_mask
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order: Tensor,
+    ):
+        """Reorder buffered internal state (for incremental generation)."""
+        input_buffer = self._get_input_buffer(incremental_state)
+        if input_buffer is not None:
+            for k in input_buffer.keys():
+                input_buffer_k = input_buffer[k]
+                if input_buffer_k is not None:
+                    if self.encoder_decoder_attention:
+                        if input_buffer_k.size(0) * self.beam_size == new_order.size(0):
+                            return incremental_state  # buffer batch * beam_size already matches new_order: state returned as-is
+                        elif self.beam_size > 1:
+                            input_buffer[k] = input_buffer_k.index_select(
+                                0,
+                                new_order.reshape(-1, self.beam_size)[:, 0]  # first index of each beam group
+                                // self.beam_size,
+                            )
+                        else:
+                            input_buffer[k] = input_buffer_k.index_select(0, new_order)
+                    else:
+                        input_buffer[k] = input_buffer_k.index_select(0, new_order)
+            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
+        return incremental_state
+
+    def set_beam_size(self, beam_size):
+        """Used for efficient beamable enc-dec attention"""
+        self.beam_size = beam_size
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        result = self.get_incremental_state(incremental_state, "attn_state")  # this module's cached state, if any
+        if result is not None:
+            return result
+        else:
+            empty_result: Dict[str, Optional[Tensor]] = {}  # typed empty dict returned when nothing is cached
+            return empty_result
+
+    def _set_input_buffer(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)  # same "attn_state" key that _get_input_buffer reads
+
+    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
+        return attn_weights  # no-op here: weights are returned unchanged (presumably an override hook - TODO confirm)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        prefix = name + "." if name != "" else ""
+        items_to_add = {}  # split q/k/v entries that replace the fused in_proj_* ones
+        keys_to_remove = []
+        for k in state_dict.keys():
+            if k.endswith(prefix + "in_proj_weight"):
+                # in_proj_weight used to be q + k + v with same dimensions
+                dim = int(state_dict[k].shape[0] / 3)
+                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
+                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
+                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
+
+                keys_to_remove.append(k)
+
+                k_bias = prefix + "in_proj_bias"
+                if k_bias in state_dict.keys():
+                    dim = int(state_dict[k].shape[0] / 3)  # thirds are sized from the fused weight's first dim
+                    items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
+                    items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
+                        dim : 2 * dim
+                    ]
+                    items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
+
+                    keys_to_remove.append(prefix + "in_proj_bias")
+        # Mutate state_dict only after the scan so it is not changed while being iterated.
+        for k in keys_to_remove:
+            del state_dict[k]
+
+        for key, value in items_to_add.items():
+            state_dict[key] = value
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
new file mode 100644
index 0000000000..97cd474b51
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+
+from .learned_positional_embedding import LearnedPositionalEmbedding
+from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
+
+
+def PositionalEmbedding(
+    num_embeddings: int,
+    embedding_dim: int,
+    padding_idx: int,
+    learned: bool = False,
+):
+    """Build a learned or a sinusoidal positional-embedding module."""
+    if not learned:
+        table_size = num_embeddings + padding_idx + 1
+        return SinusoidalPositionalEmbedding(
+            embedding_dim, padding_idx, init_size=table_size
+        )
+
+    # if padding_idx is specified then the embedding ids are offset by this
+    # index, so the learned table is enlarged by padding_idx + 1 rows
+    # TODO: The right place for this offset would be inside
+    # LearnedPositionalEmbedding. Move this there for a cleaner implementation.
+    has_padding = padding_idx is not None
+    total_rows = num_embeddings + padding_idx + 1 if has_padding else num_embeddings
+    module = LearnedPositionalEmbedding(total_rows, embedding_dim, padding_idx)
+    nn.init.normal_(module.weight, mean=0, std=embedding_dim**-0.5)
+    if has_padding:
+        nn.init.constant_(module.weight[padding_idx], 0)
+    return module
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
new file mode 100644
index 0000000000..67f6353539
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
@@ -0,0 +1,129 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+import math
+import torch
+
+
+class PositionalEncoding(nn.Module):
+    """Sinusoidal positional encoding added to a scaled input.
+
+    Args:
+        d_model: Embedding dimension.
+        dropout_rate: Dropout rate.
+        max_len: Maximum input length.
+        reverse: Whether to reverse the input position.
+    """
+
+    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
+        """Construct a PositionalEncoding object."""
+        super(PositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.reverse = reverse
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+    def extend_pe(self, x):
+        """Rebuild the cached table if it is missing or shorter than ``x.size(1)``."""
+        seq_len = x.size(1)
+        cached = self.pe
+        if cached is not None and cached.size(1) >= seq_len:
+            # long enough already: only follow x's dtype/device
+            if cached.dtype != x.dtype or cached.device != x.device:
+                self.pe = cached.to(dtype=x.dtype, device=x.device)
+            return
+        table = torch.zeros(seq_len, self.d_model)
+        if self.reverse:
+            position = torch.arange(seq_len - 1, -1, -1.0, dtype=torch.float32)
+        else:
+            position = torch.arange(0, seq_len, dtype=torch.float32)
+        position = position.unsqueeze(1)
+        channel = torch.arange(0, self.d_model, 2, dtype=torch.float32)
+        div_term = torch.exp(channel * -(math.log(10000.0) / self.d_model))
+        angles = position * div_term
+        # even channels carry the sine, odd channels the cosine
+        table[:, 0::2] = torch.sin(angles)
+        table[:, 1::2] = torch.cos(angles)
+        self.pe = table.unsqueeze(0).to(device=x.device, dtype=x.dtype)
+
+    def forward(self, x: torch.Tensor):
+        """Scale ``x`` by sqrt(d_model), add the encoding, then apply dropout.
+        Args:
+            x (torch.Tensor): Input tensor B X T X C
+        Returns:
+            torch.Tensor: Encoded tensor B X T X C
+        """
+        self.extend_pe(x)
+        encoded = x * self.xscale + self.pe[:, : x.size(1)]
+        return self.dropout(encoded)
+
+
+class RelPositionalEncoding(nn.Module):
+    """Relative positional encoding module (new implementation).
+
+    Args:
+        max_len: Maximum input length.
+        d_model: Embedding dimension.
+    """
+
+    def __init__(self, max_len, d_model):
+        """Construct a RelPositionalEncoding object."""
+        super(RelPositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+    def extend_pe(self, x):
+        """Rebuild the cached table unless it already covers ``x.size(1)``."""
+        seq_len = x.size(1)
+        cached = self.pe
+        # self.pe contains both positive and negative parts, so
+        # the length of self.pe is 2 * input_len - 1
+        if cached is not None and cached.size(1) >= seq_len * 2 - 1:
+            if cached.dtype != x.dtype or cached.device != x.device:
+                self.pe = cached.to(dtype=x.dtype, device=x.device)
+            return
+        # Suppose `i` is the position of the query vector and `j` the
+        # position of the key vector. We use positive relative positions when
+        # keys are to the left (i>j) and negative relative positions otherwise.
+        position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
+        channel = torch.arange(0, self.d_model, 2, dtype=torch.float32)
+        div_term = torch.exp(channel * -(math.log(10000.0) / self.d_model))
+        pos_angles = position * div_term
+        neg_angles = -1 * position * div_term
+        pe_positive = torch.zeros(seq_len, self.d_model)
+        pe_positive[:, 0::2] = torch.sin(pos_angles)
+        pe_positive[:, 1::2] = torch.cos(pos_angles)
+        pe_negative = torch.zeros(seq_len, self.d_model)
+        pe_negative[:, 0::2] = torch.sin(neg_angles)
+        pe_negative[:, 1::2] = torch.cos(neg_angles)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://arxiv.org/abs/1901.02860
+        left = torch.flip(pe_positive, [0]).unsqueeze(0)
+        right = pe_negative[1:].unsqueeze(0)
+        table = torch.cat([left, right], dim=1)
+        self.pe = table.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, x: torch.Tensor):
+        """Return the relative position embeddings for the length of ``x``.
+        Args:
+            x : Input tensor T X B X C.
+        Returns:
+            torch.Tensor: Position embeddings, (2 * T - 1) X 1 X d_model.
+
+        """
+        x = x.transpose(0, 1)  # Change TBC to BTC
+        self.extend_pe(x)
+        center = self.pe.size(1) // 2
+        seq_len = x.size(1)
+        # window of 2 * T - 1 rows centred on relative position 0
+        pos_emb = self.pe[:, center - seq_len + 1 : center + seq_len]
+        pos_emb = pos_emb.transpose(0, 1)  # change to TBC
+        return pos_emb
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
new file mode 100644
index 0000000000..d777dfbb6c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+def quant_noise(module, p, block_size):
+    """
+    Wraps modules and applies quantization noise to the weights for
+    subsequent quantization with Iterative Product Quantization as
+    described in "Training with Quantization Noise for Extreme Model Compression"
+
+    Args:
+        - module: nn.Module
+        - p: amount of Quantization Noise
+        - block_size: size of the blocks for subsequent quantization with iPQ
+
+    Remarks:
+        - Module weights must have the right sizes wrt the block size
+        - Only Linear, Embedding and Conv2d modules are supported for the moment
+        - For more detail on how to quantize by blocks with convolutional weights,
+          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
+        - We implement the simplest form of noise here as stated in the paper
+          which consists in randomly dropping blocks
+    """
+
+    # if no quantization noise, don't register hook
+    if p <= 0:
+        return module
+
+    # supported modules
+    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
+
+    # test whether module.weight has the right sizes wrt block_size
+    is_conv = module.weight.ndim == 4
+
+    # 2D matrix
+    if not is_conv:
+        assert (
+            module.weight.size(1) % block_size == 0
+        ), "Input features must be a multiple of block sizes"
+
+    # 4D matrix
+    else:
+        # 1x1 convolutions
+        if module.kernel_size == (1, 1):
+            assert (
+                module.in_channels % block_size == 0
+            ), "Input channels must be a multiple of block sizes"
+        # regular convolutions
+        else:
+            k = module.kernel_size[0] * module.kernel_size[1]
+            assert k % block_size == 0, "Kernel size must be a multiple of block size"
+
+    def _forward_pre_hook(mod, input):
+        # no noise for evaluation
+        if mod.training:
+            if not is_conv:
+                # gather weight and sizes
+                weight = mod.weight
+                in_features = weight.size(1)
+                out_features = weight.size(0)
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                mask = torch.zeros(
+                    in_features // block_size * out_features, device=weight.device
+                )
+                mask.bernoulli_(p)
+                mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
+
+            else:
+                # gather weight and sizes
+                weight = mod.weight
+                in_channels = mod.in_channels
+                out_channels = mod.out_channels
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                if mod.kernel_size == (1, 1):
+                    mask = torch.zeros(
+                        int(in_channels // block_size * out_channels),
+                        device=weight.device,
+                    )
+                    mask.bernoulli_(p)
+                    mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
+                else:
+                    mask = torch.zeros(
+                        weight.size(0), weight.size(1), device=weight.device
+                    )
+                    mask.bernoulli_(p)
+                    mask = (
+                        mask.unsqueeze(2)
+                        .unsqueeze(3)
+                        .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
+                    )
+
+            # scale weights and apply mask
+            mask = mask.to(
+                torch.bool
+            )  # x.bool() is not currently supported in TorchScript
+            s = 1 / (1 - p)
+            mod.weight.data = s * weight.masked_fill(mask, 0)
+
+    module.register_forward_pre_hook(_forward_pre_hook)
+    return module
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
new file mode 100644
index 0000000000..c142a802e0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .utils import SizeTracker, get_param, attrsetter, quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
new file mode 100644
index 0000000000..6f15c3e46b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
@@ -0,0 +1,211 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import random
+from collections import Counter
+
+import torch
+
+
+class EM:
+    """
+    EM algorithm used to quantize the columns of W to minimize
+
+                         ||W - W_hat||^2
+
+    Args:
+        - W: weight matrix of size (in_features x out_features)
+        - n_iter: number of k-means iterations
+        - n_centroids: number of centroids (size of codebook)
+        - eps: for cluster reassignment when an empty cluster is found
+        - max_tentatives: max number of cluster reassignment attempts when an empty cluster is found
+        - verbose: print error after each iteration
+
+    Remarks:
+        - If one cluster is empty, the most populated cluster is split into
+          two clusters
+        - All the relevant dimensions are specified in the code
+    """
+
+    def __init__(
+        self, W, n_centroids=256, n_iter=20, eps=1e-6, max_tentatives=30, verbose=True
+    ):
+        self.W = W
+        self.n_centroids = n_centroids
+        self.n_iter = n_iter
+        self.eps = eps
+        self.max_tentatives = max_tentatives
+        self.verbose = verbose
+        self.centroids = torch.Tensor()
+        self.assignments = torch.Tensor()
+        self.objective = []
+
+    def initialize_centroids(self):
+        """
+        Initializes the centroids by sampling random columns from W.
+        """
+
+        in_features, out_features = self.W.size()
+        indices = torch.randint(
+            low=0, high=out_features, size=(self.n_centroids,)
+        ).long()
+        self.centroids = self.W[:, indices].t()  # (n_centroids x in_features)
+
+    def step(self, i):
+        """
+        There are two standard steps for each iteration: expectation (E) and
+        minimization (M). The E-step (assignment) is performed with an exhaustive
+        search and the M-step (centroid computation) is performed with
+        the exact solution.
+
+        Args:
+            - i: step number
+
+        Remarks:
+            - The E-step heavily uses PyTorch broadcasting to speed up computations
+              and reduce the memory overhead
+        """
+
+        # assignments (E-step)
+        distances = self.compute_distances()  # (n_centroids x out_features)
+        self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+        n_empty_clusters = self.resolve_empty_clusters()
+
+        # centroids (M-step)
+        for k in range(self.n_centroids):
+            W_k = self.W[:, self.assignments == k]  # (in_features x size_of_cluster_k)
+            self.centroids[k] = W_k.mean(dim=1)  # (in_features)
+
+        # book-keeping
+        obj = (self.centroids[self.assignments].t() - self.W).norm(p=2).item()
+        self.objective.append(obj)
+        if self.verbose:
+            logging.info(
+                f"Iteration: {i},\t"
+                f"objective: {obj:.6f},\t"
+                f"resolved empty clusters: {n_empty_clusters}"
+            )
+
+    def resolve_empty_clusters(self):
+        """
+        If one cluster is empty, the most populated cluster is split into
+        two clusters by shifting the respective centroids. This is done
+        iteratively for a fixed number of tentatives.
+        """
+
+        # empty clusters
+        counts = Counter(map(lambda x: x.item(), self.assignments))
+        empty_clusters = set(range(self.n_centroids)) - set(counts.keys())
+        n_empty_clusters = len(empty_clusters)
+
+        tentatives = 0
+        while len(empty_clusters) > 0:
+            # given an empty cluster, find most populated cluster and split it into two
+            k = random.choice(list(empty_clusters))
+            m = counts.most_common(1)[0][0]
+            e = torch.randn_like(self.centroids[m]) * self.eps
+            self.centroids[k] = self.centroids[m].clone()
+            self.centroids[k] += e
+            self.centroids[m] -= e
+
+            # recompute assignments
+            distances = self.compute_distances()  # (n_centroids x out_features)
+            self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+
+            # check for empty clusters
+            counts = Counter(map(lambda x: x.item(), self.assignments))
+            empty_clusters = set(range(self.n_centroids)) - set(counts.keys())
+
+            # increment tentatives
+            if tentatives == self.max_tentatives:
+                logging.info(
+                    f"Could not resolve all empty clusters, {len(empty_clusters)} remaining"
+                )
+                raise EmptyClusterResolveError
+            tentatives += 1
+
+        return n_empty_clusters
+
+    def compute_distances(self):
+        """
+        For every centroid m, computes
+
+                          ||M - m[None, :]||_2
+
+        Remarks:
+            - We rely on PyTorch's broadcasting to speed up computations
+              and reduce the memory overhead
+            - Without chunking, the sizes in the broadcasting are modified as:
+              (n_centroids x n_samples x out_features) -> (n_centroids x out_features)
+            - The broadcasting computation is automatically chunked so that
+              the tensors fit into the memory of the GPU
+        """
+
+        nb_centroids_chunks = 1
+
+        while True:
+            try:
+                return torch.cat(
+                    [
+                        (self.W[None, :, :] - centroids_c[:, :, None]).norm(p=2, dim=1)
+                        for centroids_c in self.centroids.chunk(
+                            nb_centroids_chunks, dim=0
+                        )
+                    ],
+                    dim=0,
+                )
+            except RuntimeError:
+                nb_centroids_chunks *= 2
+
+    def assign(self):
+        """
+        Assigns each column of W to its closest centroid, thus essentially
+        performing the E-step in train().
+
+        Remarks:
+            - The function must be called after train() or after loading
+              centroids using self.load(), otherwise it will return empty tensors
+        """
+
+        distances = self.compute_distances()  # (n_centroids x out_features)
+        self.assignments = torch.argmin(distances, dim=0)  # (out_features)
+
+    def save(self, path, layer):
+        """
+        Saves centroids and assignments.
+
+        Args:
+            - path: folder used to save centroids and assignments
+        """
+
+        torch.save(self.centroids, os.path.join(path, "{}_centroids.pth".format(layer)))
+        torch.save(
+            self.assignments, os.path.join(path, "{}_assignments.pth".format(layer))
+        )
+        torch.save(self.objective, os.path.join(path, "{}_objective.pth".format(layer)))
+
+    def load(self, path, layer):
+        """
+        Loads centroids and assignments from a given path
+
+        Args:
+            - path: folder used to load centroids and assignments
+        """
+
+        self.centroids = torch.load(
+            os.path.join(path, "{}_centroids.pth".format(layer))
+        )
+        self.assignments = torch.load(
+            os.path.join(path, "{}_assignments.pth".format(layer))
+        )
+        self.objective = torch.load(
+            os.path.join(path, "{}_objective.pth".format(layer))
+        )
+
+
+class EmptyClusterResolveError(Exception):
+    pass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
new file mode 100644
index 0000000000..b67c8e8ad6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .qconv import PQConv2d  # NOQA
+from .qemb import PQEmbedding  # NOQA
+from .qlinear import PQLinear  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
new file mode 100644
index 0000000000..d15ec192e8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.utils import _pair
+
+
+class PQConv2d(nn.Module):
+    """
+    Quantized counterpart of nn.Conv2d module. Stores the centroid, the assignments
+    and the non-quantized biases. The full weight is re-instantiated at each forward
+    pass and autograd automatically computes the gradients with respect to the
+    centroids.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.out_channels x n_blocks
+        - bias: the non-quantized bias, must be either torch.Tensor or None
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Conv2d module
+          for the other arguments and the behavior of the module.
+        - Performance tests on GPU show that this implementation is 10% slower than
+          the non-quantized nn.Conv2d module for a standard training loop.
+        - During the backward, the gradients are averaged by cluster and not summed.
+          This explains the hook registered to the centroids.
+    """
+
+    def __init__(
+        self,
+        centroids,
+        assignments,
+        bias,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        padding_mode="zeros",
+    ):
+        super(PQConv2d, self).__init__()
+        self.block_size = centroids.size(1)
+        self.n_centroids = centroids.size(0)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.padding_mode = padding_mode
+        # check compatibility
+        if in_channels // groups * np.prod(self.kernel_size) % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % out_channels != 0:
+            raise ValueError("Wrong PQ sizes")
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups")
+        if out_channels % groups != 0:
+            raise ValueError("out_channels must be divisible by groups")
+        # define parameters
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
+        if bias is not None:
+            self.bias = nn.Parameter(bias)
+        else:
+            self.register_parameter("bias", None)
+        # register hook for averaging gradients per centroids instead of summing
+        self.centroids.register_hook(lambda x: x / self.counts[:, None])
+
+    @property
+    def weight(self):
+        return (
+            self.centroids[self.assignments]
+            .reshape(-1, self.out_channels, self.block_size)
+            .permute(1, 0, 2)
+            .reshape(
+                self.out_channels, self.in_channels // self.groups, *self.kernel_size
+            )
+        )
+
+    def forward(self, x):
+        return F.conv2d(
+            x,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def extra_repr(self):
+        s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}"
+        if self.padding != (0,) * len(self.padding):
+            s += ", padding={padding}"
+        if self.dilation != (1,) * len(self.dilation):
+            s += ", dilation={dilation}"
+        if self.groups != 1:
+            s += ", groups={groups}"
+        if self.bias is None:
+            s += ", bias=False"
+        if self.padding_mode != "zeros":
+            s += ", padding_mode={padding_mode}"
+        s += ", n_centroids={n_centroids}, block_size={block_size}"
+        return s.format(**self.__dict__)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
new file mode 100644
index 0000000000..3a74ad3c4c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PQEmbedding(nn.Module):
+    """
+    Quantized counterpart of nn.Embedding module. Stores the centroids and
+    the assignments. The full weight is re-instantiated at each forward
+    pass.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.num_embeddings x n_blocks
+        - num_embeddings, embedding_dim: shape of the embedding matrix (this module has no bias)
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Embedding module
+          for the other arguments and the behavior of the module
+        - Performance tests on GPU show that this implementation is 10% slower than
+          the non-quantized nn.Embedding module for a standard training loop.
+    """
+
+    def __init__(
+        self,
+        centroids,
+        assignments,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        sparse=False,
+        _weight=None,
+    ):
+        super(PQEmbedding, self).__init__()
+        self.block_size = centroids.size(1)
+        self.n_centroids = centroids.size(0)
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert (
+                    padding_idx < self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+            elif padding_idx < 0:
+                assert (
+                    padding_idx >= -self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        self.sparse = sparse
+        # check compatibility
+        if self.embedding_dim % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % self.num_embeddings != 0:
+            raise ValueError("Wrong PQ sizes")
+        # define parameters
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
+
+    @property
+    def weight(self):
+        return (
+            self.centroids[self.assignments]
+            .reshape(-1, self.num_embeddings, self.block_size)
+            .permute(1, 0, 2)
+            .flatten(1, 2)
+        )
+
+    def forward(self, input):
+        return F.embedding(
+            input,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+
+    def extra_repr(self):
+        s = "{num_embeddings}, {embedding_dim}"
+        if self.padding_idx is not None:
+            s += ", padding_idx={padding_idx}"
+        if self.max_norm is not None:
+            s += ", max_norm={max_norm}"
+        if self.norm_type != 2:
+            s += ", norm_type={norm_type}"
+        if self.scale_grad_by_freq is not False:
+            s += ", scale_grad_by_freq={scale_grad_by_freq}"
+        if self.sparse is not False:
+            s += ", sparse=True"
+        s += ", n_centroids={n_centroids}, block_size={block_size}"
+
+        return s.format(**self.__dict__)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
new file mode 100644
index 0000000000..9bdd25a868
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
@@ -0,0 +1,71 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PQLinear(nn.Module):
+    """
+    Quantized counterpart of nn.Linear module. Stores the centroid, the assignments
+    and the non-quantized biases. The full weight is re-instantiated at each forward
+    pass.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.out_features x n_blocks
+        - bias: the non-quantized bias
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Linear module
+          for the other arguments and the behavior of the module
+        - Performance tests on GPU show that this implementation is 15% slower than
+          the non-quantized nn.Linear module for a standard training loop.
+    """
+
+    def __init__(self, centroids, assignments, bias, in_features, out_features):
+        super(PQLinear, self).__init__()
+        self.block_size = centroids.size(1)
+        self.n_centroids = centroids.size(0)
+        self.in_features = in_features
+        self.out_features = out_features
+        # check compatibility
+        if self.in_features % self.block_size != 0:
+            raise ValueError("Wrong PQ sizes")
+        if len(assignments) % self.out_features != 0:
+            raise ValueError("Wrong PQ sizes")
+        # define parameters
+        self.centroids = nn.Parameter(centroids, requires_grad=True)
+        self.register_buffer("assignments", assignments)
+        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
+        if bias is not None:
+            self.bias = nn.Parameter(bias)
+        else:
+            self.register_parameter("bias", None)
+
+    @property
+    def weight(self):
+        return (
+            self.centroids[self.assignments]
+            .reshape(-1, self.out_features, self.block_size)
+            .permute(1, 0, 2)
+            .flatten(1, 2)
+        )
+
+    def forward(self, x):
+        return F.linear(
+            x,
+            self.weight,
+            self.bias,
+        )
+
+    def extra_repr(self):
+        return f"in_features={self.in_features},\
+                 out_features={self.out_features},\
+                 n_centroids={self.n_centroids},\
+                 block_size={self.block_size},\
+                 bias={self.bias is not None}"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
new file mode 100644
index 0000000000..eddc2eb346
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
@@ -0,0 +1,128 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .em import EM, EmptyClusterResolveError
+
+
+class PQ(EM):
+    """
+    Quantizes the layer weights W with the standard Product Quantization
+    technique. This learns a codebook of codewords or centroids of size
+    block_size from W. For further reference on using PQ to quantize
+    neural networks, see "And the Bit Goes Down: Revisiting the Quantization
+    of Neural Networks", Stock et al., ICLR 2020.
+
+    PQ is performed in two steps:
+    (1) The matrix W (weights of a fully-connected or convolutional layer)
+        is reshaped to (block_size, -1).
+            - If W is fully-connected (2D), its columns are split into
+              blocks of size block_size.
+            - If W is convolutional (4D), its filters are split along the
+              spatial dimension.
+    (2) We apply the standard EM/k-means algorithm to the resulting reshaped matrix.
+
+    Args:
+        - W: weight to quantize, 2D (out_features x in_features) or 4D convolutional
+        - block_size: size of the blocks (subvectors)
+        - n_centroids: number of centroids
+        - n_iter: number of k-means iterations
+        - eps: for cluster reassignment when an empty cluster is found
+        - max_tentatives: max number of cluster reassignment attempts when an empty cluster is found
+        - verbose: print information after each iteration
+
+    Remarks:
+        - block_size must be compatible with the shape of W
+    """
+
+    def __init__(
+        self,
+        W,
+        block_size,
+        n_centroids=256,
+        n_iter=20,
+        eps=1e-6,
+        max_tentatives=30,
+        verbose=True,
+    ):
+        self.block_size = block_size
+        W_reshaped = self._reshape(W)
+        super(PQ, self).__init__(
+            W_reshaped,
+            n_centroids=n_centroids,
+            n_iter=n_iter,
+            eps=eps,
+            max_tentatives=max_tentatives,
+            verbose=verbose,
+        )
+
+    def _reshape(self, W):
+        """
+        Reshapes the matrix W as explained in step (1).
+        """
+
+        # fully connected: by convention the weight has size out_features x in_features
+        if len(W.size()) == 2:
+            self.out_features, self.in_features = W.size()
+            assert (
+                self.in_features % self.block_size == 0
+            ), "Linear: n_blocks must be a multiple of in_features"
+            return (
+                W.reshape(self.out_features, -1, self.block_size)
+                .permute(2, 1, 0)
+                .flatten(1, 2)
+            )
+
+        # convolutional: we reshape along the spatial dimension
+        elif len(W.size()) == 4:
+            self.out_channels, self.in_channels, self.k_h, self.k_w = W.size()
+            assert (
+                self.in_channels * self.k_h * self.k_w
+            ) % self.block_size == 0, (
+                "Conv2d: n_blocks must be a multiple of in_channels * k_h * k_w"
+            )
+            return (
+                W.reshape(self.out_channels, -1, self.block_size)
+                .permute(2, 1, 0)
+                .flatten(1, 2)
+            )
+        # not implemented
+        else:
+            raise NotImplementedError(W.size())
+
+    def encode(self):
+        """
+        Performs self.n_iter EM steps.
+        """
+
+        self.initialize_centroids()
+        for i in range(self.n_iter):
+            try:
+                self.step(i)
+            except EmptyClusterResolveError:
+                break
+
+    def decode(self):
+        """
+        Returns the encoded full weight matrix. Must be called after
+        the encode function.
+        """
+
+        # fully connected case
+        if "k_h" not in self.__dict__:
+            return (
+                self.centroids[self.assignments]
+                .reshape(-1, self.out_features, self.block_size)
+                .permute(1, 0, 2)
+                .flatten(1, 2)
+            )
+
+        # convolutional case
+        else:
+            return (
+                self.centroids[self.assignments]
+                .reshape(-1, self.out_channels, self.block_size)
+                .permute(1, 0, 2)
+                .reshape(self.out_channels, self.in_channels, self.k_h, self.k_w)
+            )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
new file mode 100644
index 0000000000..eceeef8ba3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
@@ -0,0 +1,376 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import re
+from operator import attrgetter, itemgetter
+import torch
+import numpy as np
+import torch.distributed as dist
+import torch.nn as nn
+
+from .modules import PQConv2d, PQEmbedding, PQLinear
+from .pq import PQ
+
+
+def quantize_model_(
+    model,
+    size_tracker,
+    layers_to_quantize,
+    block_sizes_config,
+    n_centroids_config,
+    step=0,
+    n_iter=15,
+    eps=1e-6,
+    max_tentatives=100,
+    remove_weights=False,
+    verbose=True,
+    state_dict=None,
+):
+    """
+    Quantize a model in-place by stages. All the targeted
+    layers are replaced by their quantized counterpart,
+    and the model is ready for the finetuning of the
+    centroids in a standard training loop (no modifications
+    required). Note that we do not quantize biases.
+
+    Args:
+        - model: a nn.Module
+        - size_tracker: useful for tracking quantization statistics
+        - layers_to_quantize: a list containing regexps for
+          filtering the layers to quantize at each stage according
+          to their name (as in model.named_parameters())
+        - block_sizes_config: dict like
+          {
+              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
+              'Linear': ('in_features', {'*': 8})
+          }
+          For instance, all conv2d layers with kernel size 3x3 have
+          a block size of 9 and all Linear layers are quantized with
+          a block size of 8, irrespective of their size.
+        - n_centroids_config: dict like
+          {
+              'Conv2d': ('kernel_size', {'*': 256}),
+              'Linear': ('in_features', {'*': 256})
+          }
+          For instance, all conv2d layers are quantized with 256 centroids
+        - step: the layers to quantize inplace corresponding
+          to layers_to_quantize[step]
+    """
+
+    quantized_layers = get_layers(
+        model, layers_to_quantize[step], remove_weights=remove_weights
+    )
+
+    for layer in quantized_layers:
+
+        # book-keeping
+        is_master_process = (not dist.is_initialized()) or (
+            dist.is_initialized() and dist.get_rank() == 0
+        )
+        verbose = verbose and is_master_process
+
+        # get block size and centroids
+        module = attrgetter(layer)(model)
+        block_size = get_param(module, layer, block_sizes_config)
+        n_centroids = get_param(module, layer, n_centroids_config)
+        if verbose:
+            logging.info(
+                f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids"
+            )
+
+        # quantize layer
+        weight = module.weight.data.clone()
+        is_bias = "bias" in [x[0] for x in module.named_parameters()]
+        bias = module.bias.data.clone() if is_bias else None
+        quantizer = PQ(
+            weight,
+            block_size,
+            n_centroids=n_centroids,
+            n_iter=n_iter,
+            eps=eps,
+            max_tentatives=max_tentatives,
+            verbose=verbose,
+        )
+
+        # quantization performed on all GPUs with same seed
+        quantizer.encode()
+        centroids = quantizer.centroids.contiguous()
+        assignments = quantizer.assignments.contiguous()
+
+        # If n_iter = 0 and state_dict is provided, then
+        # we initialize random assignments and centroids to
+        # random values of the appropriate dimensions
+        # because the quantized model parameters will be
+        # overwritten by the state_dict later on.
+        if n_iter == 0 and state_dict:
+            # Initialize random centroids of the correct size
+            centroids = torch.rand(centroids.size())
+            centroids.cuda()
+            # Get counts and assignment keys from layer in loaded checkpoint.
+            counts_key = layer + "." + "counts"
+            assignment_key = layer + "." + "assignments"
+            # Get number of different bins to include.
+            counts = list(state_dict[counts_key].shape)[0]
+            print(layer)
+            print(state_dict[counts_key])
+            print(counts)
+            # Initialize random assignments of the correct size
+            # with an appropriate number of bins.
+            num_assignments = list(state_dict[assignment_key].shape)[0]
+            num_extra = num_assignments - counts
+            print(num_assignments)
+            print(num_extra)
+            assignments_bins = torch.arange(counts)
+            assignments_rand = torch.randint(0, counts - 1, (num_extra,))
+            assignments = torch.cat((assignments_bins, assignments_rand), 0)
+            # assignments = assignments.type(torch.IntTensor)
+            assignments.cuda()
+            print("assignments")
+            print(assignments)
+
+        # broadcast results to make sure weights are up-to-date
+        if dist.is_initialized():
+            dist.broadcast(centroids, 0)
+            dist.broadcast(assignments, 0)
+
+        # instantiate the quantized counterpart
+        if isinstance(module, nn.Linear):
+            out_features, in_features = map(
+                lambda k: module.__dict__[k], ["out_features", "in_features"]
+            )
+            quantized_module = PQLinear(
+                centroids, assignments, bias, in_features, out_features
+            )
+        elif isinstance(module, nn.Embedding):
+            num_embeddings, embedding_dim = map(
+                lambda k: module.__dict__[k], ["num_embeddings", "embedding_dim"]
+            )
+            quantized_module = PQEmbedding(
+                centroids, assignments, num_embeddings, embedding_dim
+            )
+        elif isinstance(module, nn.Conv2d):
+            out_channels, in_channels, kernel_size = map(
+                lambda k: module.__dict__[k],
+                ["out_channels", "in_channels", "kernel_size"],
+            )
+            stride, padding, dilation, groups, padding_mode = map(
+                lambda k: module.__dict__[k],
+                ["stride", "padding", "dilation", "groups", "padding_mode"],
+            )
+
+            quantized_module = PQConv2d(
+                centroids,
+                assignments,
+                bias,
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                padding_mode=padding_mode,
+            )
+        else:
+            raise ValueError(f"Module {module} not yet supported for quantization")
+
+        # replace layer by its quantized counterpart
+        attrsetter(layer)(model, quantized_module)
+
+        # update statistics
+        size_tracker.update(weight, block_size, n_centroids)
+
+    # return name of quantized layers
+    return quantized_layers
+
+
+def get_layers(model, filter_regexp, remove_weights=False):
+    """
+    Filters out the layers according to a regexp. Note that
+    we omit biases.
+
+    Args:
+        - model: a nn.Module
+        - filter_regexp: a regexp to filter the layers to keep
+          according to their name in model.named_parameters().
+          For instance, the regexp:
+
+             down_layers\\.[123456]\\.(conv[12]|identity\\.conv)
+
+          is keeping blocks down_layers from 1 to 6, and inside
+          each block is keeping conv1, conv2 and identity.conv.
+
+    Remarks:
+        - We add (module\\.)? at the beginning of the regexp to
+          account for the possible use of nn.parallel.DataParallel
+    """
+
+    # get all parameter names
+    all_layers = map(itemgetter(0), model.named_parameters())
+
+    # remove biases
+    all_layers = filter(lambda x: "bias" not in x, all_layers)
+
+    # remove .weight in all other names (or .weight_orig if spectral norm is used)
+    all_layers = map(lambda x: x.replace(".weight_orig", ""), all_layers)
+    # remove_weights indicates whether the ".weights" suffix should also be stripped,
+    # in addition to the ".weight_orig" and ".weight" suffixes on names
+    if remove_weights:
+        all_layers = map(lambda x: x.replace(".weights", ""), all_layers)
+    all_layers = map(lambda x: x.replace(".weight", ""), all_layers)
+
+    # return filtered layers
+    filter_regexp = "(module\\.)?" + "(" + filter_regexp + ")"
+    r = re.compile(filter_regexp)
+
+    return list(filter(r.match, all_layers))
+
+
+def get_param(module, layer_name, param_config):
+    """
+    Given a quantization configuration, get the right parameter
+    for the module to be quantized.
+
+    Args:
+        - module: a nn.Module
+        - layer_name: the name of the layer
+        - param_config: a dict like
+          {
+              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
+              'Linear': ('in_features', {'*': 8})
+          }
+          For instance, all conv2d layers with kernel size 3x3 have
+          a block size of 9 and all Linear layers are quantized with
+          a block size of 8, irrespective of their size.
+
+    Remarks:
+        - if 'fuzzy_name' is passed as a parameter, layers whose layer_name
+          include 'fuzzy_name' will be assigned the given parameter.
+          In the following example, conv.expand layers will have a block
+          size of 9 while conv.reduce will have a block size of 4 and all
+          other layers will have a block size of 2.
+          {
+              'Conv2d': ('fuzzy_name', {'expand': 9, 'reduce': 4, '*': 2}),
+              'Linear': ('fuzzy_name', {'classifier': 8, 'projection': 4})
+          }
+
+    """
+
+    layer_type = module.__class__.__name__
+
+    if layer_type not in param_config:
+        raise KeyError(f"Layer type {layer_type} not in config for layer {module}")
+
+    feature, params = param_config[module.__class__.__name__]
+
+    if feature != "fuzzy_name":
+        feature_value = str(getattr(module, feature))
+        if feature_value not in params:
+            if "*" in params:
+                feature_value = "*"
+            else:
+                raise KeyError(
+                    f"{feature}={feature_value} not in config for layer {module}"
+                )
+    else:
+        feature_values = [name for name in params if name in layer_name]
+        if len(feature_values) == 0:
+            if "*" in params:
+                feature_value = "*"
+            else:
+                raise KeyError(f"name={layer_name} not in config for {module}")
+        else:
+            feature_value = feature_values[0]
+
+    return params[feature_value]
+
+
+class SizeTracker(object):
+    """
+    Class to keep track of the compressed network size with iPQ.
+
+    Args:
+        - model: a nn.Module
+
+    Remarks:
+        - The compressed size is the sum of three components
+          for each layer in the network:
+              (1) Storing the centroids given by iPQ in fp16
+              (2) Storing the assignments of the blocks in int8
+              (3) Storing all non-compressed elements such as biases
+        - This cost is only valid if we use 256 centroids (then
+          indexing can indeed be done with int8).
+    """
+
+    def __init__(self, model):
+        self.model = model
+        self.size_non_compressed_model = self.compute_size()
+        self.size_non_quantized = self.size_non_compressed_model
+        self.size_index = 0
+        self.size_centroids = 0
+        self.n_quantized_layers = 0
+
+    def compute_size(self):
+        """
+        Computes the size of the model (in MB).
+        """
+
+        res = 0
+        for _, p in self.model.named_parameters():
+            res += p.numel()
+        return res * 4 / 1024 / 1024
+
+    def update(self, W, block_size, n_centroids):
+        """
+        Updates the running statistics when quantizing a new layer.
+        """
+
+        # bits per weights
+        bits_per_weight = np.log2(n_centroids) / block_size
+        self.n_quantized_layers += 1
+
+        # size of indexing the subvectors of size block_size (in MB)
+        size_index_layer = bits_per_weight * W.numel() / 8 / 1024 / 1024
+        self.size_index += size_index_layer
+
+        # size of the centroids stored in float16 (in MB)
+        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
+        self.size_centroids += size_centroids_layer
+
+        # fp32 size of the layer just quantized (in MB), removed from the non-quantized total
+        size_uncompressed_layer = W.numel() * 4 / 1024 / 1024
+        self.size_non_quantized -= size_uncompressed_layer
+
+    def __repr__(self):
+        size_compressed = (
+            self.size_index + self.size_centroids + self.size_non_quantized
+        )
+        compression_ratio = self.size_non_compressed_model / size_compressed  # NOQA
+        return (
+            f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. "
+            f"After quantizing {self.n_quantized_layers} layers, size "
+            f"(indexing + centroids + other): {self.size_index:.2f} MB + "
+            f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = "
+            f"{size_compressed:.2f} MB, compression ratio: {compression_ratio:.2f}x"
+        )
+
+
+def attrsetter(*items):
+    def resolve_attr(obj, attr):
+        attrs = attr.split(".")
+        head = attrs[:-1]
+        tail = attrs[-1]
+
+        for name in head:
+            obj = getattr(obj, name)
+        return obj, tail
+
+    def g(obj, val):
+        for attr in items:
+            resolved_obj, resolved_attr = resolve_attr(obj, attr)
+            setattr(resolved_obj, resolved_attr, val)
+
+    return g
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
new file mode 100644
index 0000000000..b46d682c0e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def parse_config_yaml(yaml_data):
+    # Initialize to default options.
+    quantization_options = {
+        "n_centroids": {
+            "Linear": ["in_features", {"*": 256}],
+            "Embedding": ["embedding_dim", {"*": 256}],
+        },
+        "block_sizes": {
+            "Linear": ["fuzzy_name", {"fc": 8, "attn": 4, "emb": 4}],
+            "Embedding": ["fuzzy_name", {"emb": 8}],
+        },
+        "layers_to_quantize": [
+            "decoder\\.layers\\.\\d+\\.fc[12]",
+            "decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]",
+            "decoder\\.layers\\.\\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)",
+        ],
+    }
+
+    if "n_centroids" in yaml_data:
+        quantization_options["n_centroids"] = {
+            layer: convert_yaml_to_tuple(layer_data)
+            for layer, layer_data in yaml_data["n_centroids"].items()
+        }
+    if "block_sizes" in yaml_data:
+        quantization_options["block_sizes"] = {
+            layer: convert_yaml_to_tuple(layer_data)
+            for layer, layer_data in yaml_data["block_sizes"].items()
+        }
+    if "layers_to_quantize" in yaml_data:
+        quantization_options["layers_to_quantize"] = yaml_data["layers_to_quantize"]
+
+    return quantization_options
+
+
+def convert_yaml_to_tuple(yaml_dictionary):
+    """Converts a yaml dictionary with two keys: `key` and `value` into a two
+    argument tuple of those values."""
+    return (yaml_dictionary["key"], yaml_dictionary["value"])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
new file mode 100644
index 0000000000..143834f3d0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .utils import quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
new file mode 100644
index 0000000000..8031d9cdb2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .qact import ActivationQuantizer  # NOQA
+from .qconv import IntConv2d  # NOQA
+from .qemb import IntEmbedding  # NOQA
+from .qlinear import IntLinear  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
new file mode 100644
index 0000000000..b362c30dc7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ..ops import emulate_int
+
+
+class ActivationQuantizer:
+    """
+    Fake scalar quantization of the activations using a forward hook.
+
+    Args:
+        - module: a nn.Module for which we quantize the *post-activations*
+        - p: proportion of activations to quantize, set by default to 1
+        - update_step: to recompute quantization parameters
+        - bits: number of bits for quantization
+        - method: choose among {"tensor", "histogram", "channel"}
+        - clamp_threshold: to prevent gradients overflow
+
+    Remarks:
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - For the list of quantization methods and number of bits, see ops.py
+        - To remove the hook from the module, simply call self.handle.remove()
+        - At test time, the activations are fully quantized
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - The activations are hard-clamped to the range spanned by the current scale
+          and zero_point; clamp_threshold is stored but not used by the hook
+    """
+
+    def __init__(
+        self,
+        module,
+        p=1,
+        update_step=1000,
+        bits=8,
+        method="histogram",
+        clamp_threshold=5,
+    ):
+        self.module = module
+        self.p = p
+        self.update_step = update_step
+        self.counter = 0
+        self.bits = bits
+        self.method = method
+        self.clamp_threshold = clamp_threshold
+        self.handle = None
+        self.register_hook()
+
+    def register_hook(self):
+        # forward hook
+        def quantize_hook(module, x, y):
+
+            # update parameters every update_step iterations
+            if self.counter % self.update_step == 0:
+                self.scale = None
+                self.zero_point = None
+            self.counter += 1
+
+            # train with QuantNoise and evaluate the fully quantized network
+            p = self.p if self.module.training else 1
+
+            # quantize activations
+            y_q, self.scale, self.zero_point = emulate_int(
+                y.detach(),
+                bits=self.bits,
+                method=self.method,
+                scale=self.scale,
+                zero_point=self.zero_point,
+            )
+
+            # mask to apply noise
+            mask = torch.zeros_like(y)
+            mask.bernoulli_(1 - p)
+            noise = (y_q - y).masked_fill(mask.bool(), 0)
+
+            # using straight-through estimator (STE)
+            clamp_low = -self.scale * self.zero_point
+            clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
+            return torch.clamp(y, clamp_low.item(), clamp_high.item()) + noise.detach()
+
+        # register hook
+        self.handle = self.module.register_forward_hook(quantize_hook)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
new file mode 100644
index 0000000000..29744744ec
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from torch.nn.modules.conv import _ConvNd
+from torch.nn.modules.utils import _pair
+
+from ..ops import emulate_int
+
+
+class IntConv2d(_ConvNd):
+    """
+    Quantized counterpart of the nn.Conv2d module that applies QuantNoise during training.
+
+    Args:
+        - standard nn.Conv2d parameters
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+        - update_step: recompute scale and zero_point every update_step iterations
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        p=0,
+        bits=8,
+        method="histogram",
+        update_step=1000,
+    ):
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        super(IntConv2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _pair(0),
+            groups,
+            bias,
+            padding_mode,
+        )
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0
+
+    def _conv_forward(self, input, weight):
+        if self.padding_mode != "zeros":
+            return F.conv2d(
+                F.pad(input, self._padding_repeated_twice, mode=self.padding_mode),
+                weight,
+                self.bias,
+                self.stride,
+                _pair(0),
+                self.dilation,
+                self.groups,
+            )
+        return F.conv2d(
+            input,
+            weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+        )
+
+    def forward(self, input):
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # update parameters every update_step iterations
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask to apply noise
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE)
+        clamp_low = -self.scale * self.zero_point
+        clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
+        weight = (
+            torch.clamp(self.weight, clamp_low.item(), clamp_high.item())
+            + noise.detach()
+        )
+
+        # return output
+        output = self._conv_forward(input, weight)
+        return output
+
+    def extra_repr(self):
+        return (
+            "in_channels={}, out_channels={}, kernel_size={}, stride={}, "
+            "padding={}, dilation={}, groups={}, bias={}, quant_noise={}, "
+            "bits={}, method={}".format(
+                self.in_channels,
+                self.out_channels,
+                self.kernel_size,
+                self.stride,
+                self.padding,
+                self.dilation,
+                self.groups,
+                self.bias is not None,
+                self.p,
+                self.bits,
+                self.method,
+            )
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
new file mode 100644
index 0000000000..3b293ac31e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
@@ -0,0 +1,147 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..ops import emulate_int
+
+
+class IntEmbedding(nn.Module):
+    """
+    Quantized counterpart of the nn.Embedding module that applies QuantNoise during training.
+
+    Args:
+        - num_embeddings: number of tokens
+        - embedding_dim: embedding dimension
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+        - update_step: recompute scale and zero_point every update_step iterations
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick
+        - Parameters scale and zero_point are recomputed every update_step
+          forward pass to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        max_norm=None,
+        norm_type=2.0,
+        scale_grad_by_freq=False,
+        sparse=False,
+        _weight=None,
+        p=0,
+        update_step=1000,
+        bits=8,
+        method="histogram",
+    ):
+        super(IntEmbedding, self).__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if padding_idx is not None:
+            if padding_idx > 0:
+                assert (
+                    padding_idx < self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+            elif padding_idx < 0:
+                assert (
+                    padding_idx >= -self.num_embeddings
+                ), "Padding_idx must be within num_embeddings"
+                padding_idx = self.num_embeddings + padding_idx
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        if _weight is None:
+            self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
+            self.reset_parameters()
+        else:
+            assert list(_weight.shape) == [
+                num_embeddings,
+                embedding_dim,
+            ], "Shape of weight does not match num_embeddings and embedding_dim"
+            self.weight = nn.Parameter(_weight)
+        self.sparse = sparse
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0
+
+    def reset_parameters(self):
+        nn.init.normal_(self.weight)
+        if self.padding_idx is not None:
+            with torch.no_grad():
+                self.weight[self.padding_idx].fill_(0)
+
+    def forward(self, input):
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # update parameters every update_step iterations
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask to apply noise
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE)
+        clamp_low = -self.scale * self.zero_point
+        clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
+        weight = (
+            torch.clamp(self.weight, clamp_low.item(), clamp_high.item())
+            + noise.detach()
+        )
+
+        # return output
+        output = F.embedding(
+            input,
+            weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+        return output
+
+    def extra_repr(self):
+        s = "{num_embeddings}, {embedding_dim}"
+        if self.padding_idx is not None:
+            s += ", padding_idx={padding_idx}"
+        if self.max_norm is not None:
+            s += ", max_norm={max_norm}"
+        if self.norm_type != 2:
+            s += ", norm_type={norm_type}"
+        if self.scale_grad_by_freq is not False:
+            s += ", scale_grad_by_freq={scale_grad_by_freq}"
+        if self.sparse is not False:
+            s += ", sparse=True"
+        s += "quant_noise={p}, bits={bits}, method={method}"
+        return s.format(**self.__dict__)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
new file mode 100644
index 0000000000..78606a25b9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
@@ -0,0 +1,113 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..ops import emulate_int
+
+
+class IntLinear(nn.Module):
+    """
+    Quantized counterpart of the nn.Linear module that applies QuantNoise during training.
+
+    Args:
+        - in_features: input features
+        - out_features: output features
+        - bias: bias or not
+        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
+        - update_step: recompute scale and zero_point every update_step forward passes
+        - bits: number of bits
+        - method: choose among {"tensor", "histogram", "channel"}
+
+    Remarks:
+        - We use the straight-through estimator so that the gradients
+          back-propagate nicely in the network, this is implemented with
+          the detach() trick.
+        - Parameters scale and zero_point are recomputed every update_step
+          forward passes to reduce the overhead
+        - At test time, the weights are fully quantized
+    """
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        bias=True,
+        p=0,
+        update_step=3000,
+        bits=8,
+        method="histogram",
+    ):
+        super().__init__()
+        self.in_features = int(in_features)
+        self.out_features = int(out_features)
+        # size the tensors from the int-converted dims so both stay consistent
+        self.weight = nn.Parameter(torch.Tensor(self.out_features, self.in_features))
+        self.chosen_bias = bias
+        if self.chosen_bias:
+            self.bias = nn.Parameter(torch.Tensor(self.out_features))
+        else:
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+
+        # quantization parameters
+        self.p = p
+        self.bits = bits
+        self.method = method
+        self.update_step = update_step
+        self.counter = 0  # number of forward passes seen so far
+
+    def reset_parameters(self):
+        """Xavier-uniform init for the weight; zeros for the bias when present."""
+        nn.init.xavier_uniform_(self.weight)
+        if self.chosen_bias:
+            nn.init.constant_(self.bias, 0.0)
+
+    def forward(self, input):
+        """Linear layer with noisy (train) or fully quantized (eval) weights."""
+        # train with QuantNoise and evaluate the fully quantized network
+        p = self.p if self.training else 1
+
+        # drop the cached qparams every update_step calls so they get re-estimated
+        if self.counter % self.update_step == 0:
+            self.scale = None
+            self.zero_point = None
+        self.counter += 1
+
+        # quantize weight
+        weight_quantized, self.scale, self.zero_point = emulate_int(
+            self.weight.detach(),
+            bits=self.bits,
+            method=self.method,
+            scale=self.scale,
+            zero_point=self.zero_point,
+        )
+
+        # mask to apply noise
+        mask = torch.zeros_like(self.weight)
+        mask.bernoulli_(1 - p)
+        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)
+
+        # using straight-through estimator (STE)
+        clamp_low = -self.scale * self.zero_point
+        clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
+        if clamp_low.numel() == 1:
+            clamped = torch.clamp(self.weight, clamp_low.item(), clamp_high.item())
+        else:
+            # per-channel qparams: .item() would raise, so clamp with tensor bounds
+            clamped = torch.max(torch.min(self.weight, clamp_high), clamp_low)
+        return F.linear(input, clamped + noise.detach(), self.bias)
+
+    def extra_repr(self):
+        return "in_features={}, out_features={}, bias={}, quant_noise={}, bits={}, method={}".format(
+            self.in_features,
+            self.out_features,
+            self.bias is not None,
+            self.p,
+            self.bits,
+            self.method,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
new file mode 100644
index 0000000000..e0f9a0c1f8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+try:
+    import torch.ao.quantization as quantization
+except ImportError:
+    import torch.quantization as quantization
+
+
+def emulate_int(w, bits, method, scale=None, zero_point=None):
+    quantizer = globals()[f"emulate_int8_{method}"]  # look up sibling by its suffix
+    return quantizer(w, bits=bits, scale=scale, zero_point=zero_point)
+
+
+def quantize(w, scale, zero_point, bits=8):
+    """Fake-quantize ``w``: snap to the ``bits``-bit integer grid, then map back."""
+    top = 2**bits - 1  # highest integer level; 255 for the default 8 bits
+    levels = torch.round(w / scale + zero_point)
+    levels = torch.clamp(levels, 0, top)
+    return (levels - zero_point) * scale
+
+
+def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
+    """Fake-quantize ``w``, deriving qparams with a HistogramObserver if absent."""
+    if scale is None:  # derive qparams; they follow w's device (no CUDA assumed)
+        obs = quantization.observer.HistogramObserver()
+        obs.to(device=w.device)
+        obs(w.float())
+        scale, zero_point = obs.calculate_qparams()
+        scale, zero_point = (t.to(w.device).type_as(w) for t in (scale, zero_point))
+    return quantize(w, scale, zero_point, bits=bits), scale, zero_point
+
+
+def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
+    """Fake-quantize ``w`` per channel (last axis, symmetric) on ``w``'s own device."""
+    if scale is None:  # calculate_qparams() also exists where get_qparams() does not
+        obs = quantization.observer.PerChannelMinMaxObserver(
+            ch_axis=-1, qscheme=torch.per_channel_symmetric
+        )
+        obs.to(device=w.device)
+        obs(w)
+        scale, zero_point = obs.calculate_qparams()
+        scale, zero_point = (t.to(w.device).type_as(w) for t in (scale, zero_point))
+    return quantize(w, scale, zero_point, bits=bits), scale, zero_point
+
+
+def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
+    """Fake-quantize ``w``, deriving qparams with a MinMaxObserver if absent."""
+    if scale is None:  # derive qparams; they follow w's device (no CUDA assumed)
+        obs = quantization.observer.MinMaxObserver()
+        obs.to(device=w.device)
+        obs(w)
+        scale, zero_point = obs.calculate_qparams()
+        scale, zero_point = (t.to(w.device).type_as(w) for t in (scale, zero_point))
+    return quantize(w, scale, zero_point, bits=bits), scale, zero_point
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
new file mode 100644
index 0000000000..d4b1cc255b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from operator import attrgetter
+
+import torch.distributed as dist
+import torch.nn as nn
+
+from ..pq.utils import attrsetter, get_layers
+from .modules import ActivationQuantizer, IntConv2d, IntEmbedding, IntLinear
+
+# nn module type -> scalar-quantized counterpart looked up by quantize_model_
+MAPPING = {nn.Linear: IntLinear, nn.Embedding: IntEmbedding, nn.Conv2d: IntConv2d}
+
+
+def quantize_model_(
+    model, p=0.2, bits=8, update_step=3000, method="histogram", remove_weights=False
+):
+    """
+    Replaces all supported modules with their scalar quantized counterpart and
+    registers hooks to quantize the post-activations of those modules.
+
+    Args:
+        - model: a nn.Module
+        - p: amount of noise (0 for no noise, 1 to quantize all the weights/activations)
+        - bits: number of bits
+        - update_step: update quantization parameters every update_step steps
+        - method: quantization method stored on each module (e.g. "histogram")
+        - remove_weights: forwarded to get_layers (whether the weights extension
+          is also removed from the layer names)
+    Returns:
+        every layer name produced by get_layers, including names that were skipped
+    """
+    layer_names = get_layers(model, "(.*?)", remove_weights=remove_weights)
+
+    # rank is loop-invariant: only the master process (or a non-distributed run) logs
+    is_master_process = (not dist.is_initialized()) or dist.get_rank() == 0
+
+    for layer in layer_names:
+        # recover module
+        module = attrgetter(layer)(model)
+        if is_master_process:
+            logging.info(
+                f"Quantizing layer {layer} with bits={bits} and QuantNoise={p}"
+            )
+
+        # Exact-type lookup: a subclass of a supported module (which isinstance
+        # would accept) used to raise KeyError here; it is now skipped instead.
+        QuantizedModule = MAPPING.get(module.__class__)
+        if QuantizedModule is None:
+            if is_master_process:
+                logging.info(f"Module {module} not yet supported for quantization")
+            continue
+
+        # Build the counterpart without running __init__: it shares the original
+        # module's state (parameters included). The quantization attributes are
+        # set on the new module only, leaving the original module's __dict__ alone.
+        quantized_module = QuantizedModule.__new__(QuantizedModule)
+        quantized_module.__dict__.update(module.__dict__)
+        quantized_module.__dict__.update(
+            {
+                "p": p,
+                "update_step": update_step,
+                "bits": bits,
+                "method": method,
+                "counter": 0,
+            }
+        )
+
+        # activation quantization: constructed for its side effect on the module
+        # (presumably registers hooks; confirm in ActivationQuantizer)
+        ActivationQuantizer(quantized_module, p=0, bits=bits, method=method)
+
+        # replace layer by its quantized counterpart
+        attrsetter(layer)(model, quantized_module)
+
+    return layer_names
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
new file mode 100644
index 0000000000..84b88984ea
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
@@ -0,0 +1,51 @@
+import torch
+
+
+class RotaryPositionalEmbedding(torch.nn.Module):
+    def __init__(self, dim, base=10000, precision=torch.half):
+        """Rotary positional embedding
+        Reference : https://blog.eleuther.ai/rotary-embeddings/
+        Paper: https://arxiv.org/pdf/2104.09864.pdf
+        Args:
+            dim: Dimension of embedding
+            base: Base value for exponential
+            precision: stored on the module; not read by forward()
+        """
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        # cache for the most recent (seq_len, device); filled lazily by forward()
+        self.seq_len_cached = self.cos_cached = self.sin_cached = None
+        self.precision = precision
+
+    def forward(self, x, seq_len=None):
+        """Return ``(cos, sin)`` tables shaped (seq_len, 1, 1, dim) for even dim.
+        Args:
+            x: Input x with T X B X C
+            seq_len: Sequence length of input x (defaults to ``x.shape[0]``)
+        """
+        seq_len = x.shape[0] if seq_len is None else seq_len
+        if seq_len != self.seq_len_cached or self.cos_cached.device != x.device:
+            self.seq_len_cached = seq_len
+            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self.cos_cached = emb.cos()[:, None, None, :]
+            self.sin_cached = emb.sin()[:, None, None, :]
+        return self.cos_cached, self.sin_cached
+
+
+# rotary pos emb helpers:
+def rotate_half(x):
+    """Split the last dim into halves ``(a, b)`` and return ``cat(-b, a)``."""
+    mid = x.shape[-1] // 2
+    front, back = x[..., :mid], x[..., mid:]
+    return torch.cat((-back, front), dim=front.ndim - 1)  # dim=-1 hits an old torch bug
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
+    """Rotate ``q`` and ``k`` with the ``cos``/``sin`` rows starting at ``offset``."""
+    window = slice(offset, q.shape[0] + offset)  # q's length selects rows for both
+    cos, sin = cos[window, ...], sin[window, ...]
+    q_rotated = (q * cos) + (rotate_half(q) * sin)
+    return q_rotated, (k * cos) + (rotate_half(k) * sin)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
new file mode 100644
index 0000000000..a3ce4131c6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from torch import nn
+
+
+class SamePad(nn.Module):
+    """Trims ``remove`` trailing elements along the third dimension of the input."""
+
+    def __init__(self, kernel_size, causal=False):
+        super().__init__()
+        # causal: trim kernel_size - 1; otherwise trim one element for even kernels
+        self.remove = kernel_size - 1 if causal else int(kernel_size % 2 == 0)
+
+    def forward(self, x):
+        trim = self.remove
+        # a zero trim must skip the slice: x[:, :, :-0] would be empty
+        return x[:, :, :-trim] if trim > 0 else x
+
+
+class SamePad2d(nn.Module):
+    """Trims the last row and column of a 4-D input when ``kernel_size`` is even."""
+
+    def __init__(self, kernel_size):
+        super().__init__()
+        self.remove = int(kernel_size % 2 == 0)
+
+    def forward(self, x):
+        assert x.dim() == 4
+        return x[:, :, : -self.remove, : -self.remove] if self.remove > 0 else x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
new file mode 100644
index 0000000000..c96247c759
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import torch
+
+
+class ScalarBias(torch.autograd.Function):
+    """Prepends one slot filled with ``bias_init`` along ``dim``; used in
+    self-attention so the model can attend to that slot instead of the past.
+    """
+
+    @staticmethod
+    def forward(ctx, input, dim, bias_init):
+        ctx.dim = dim
+        grown = list(input.size())
+        grown[dim] += 1
+        padded = input.new(*grown).fill_(bias_init)
+        padded.narrow(dim, 1, grown[dim] - 1).copy_(input)
+        return padded
+
+    @staticmethod
+    def backward(ctx, grad):
+        kept = grad.size(ctx.dim) - 1  # slot 0 holds the constant: no gradient
+        return grad.narrow(ctx.dim, 1, kept), None, None
+
+
+def scalar_bias(input, dim, bias_init=0):  # functional entry point for ScalarBias
+    return ScalarBias.apply(input, dim, bias_init)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
new file mode 100644
index 0000000000..4793ecfb52
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from typing import Any, Optional
+
+import torch
+import torch.onnx.operators
+from fairseq import utils
+from torch import Tensor, nn
+
+
+class SinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length.
+
+    Padding symbols are ignored.
+    """
+
+    def __init__(self, embedding_dim, padding_idx, init_size=1024):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx if padding_idx is not None else 0
+        self.weights = SinusoidalPositionalEmbedding.get_embedding(
+            init_size, embedding_dim, padding_idx
+        )
+        self.onnx_trace = False
+        self.register_buffer("_float_tensor", torch.FloatTensor(1))
+        self.max_positions = int(1e5)
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    @staticmethod
+    def get_embedding(
+        num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
+    ):
+        """Build sinusoidal embeddings of shape (num_embeddings, embedding_dim).
+
+        This matches the implementation in tensor2tensor, but differs slightly
+        from the description in Section 3.5 of "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        # max(..., 1) avoids a ZeroDivisionError when embedding_dim is 2 or 3
+        log_step = math.log(10000) / max(half_dim - 1, 1)
+        freqs = torch.exp(torch.arange(half_dim, dtype=torch.float) * -log_step)
+        positions = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1)
+        angles = positions * freqs.unsqueeze(0)
+        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(
+            num_embeddings, -1
+        )
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+        return emb
+
+    def forward(
+        self,
+        input,
+        incremental_state: Optional[Any] = None,
+        timestep: Optional[Tensor] = None,
+        positions: Optional[Any] = None,
+    ):
+        """Input is expected to be of size [bsz x seqlen]; ``positions`` is ignored."""
+        bspair = torch.onnx.operators.shape_as_tensor(input)
+        bsz, seq_len = bspair[0], bspair[1]
+        max_pos = self.padding_idx + 1 + seq_len
+        if self.weights is None or max_pos > self.weights.size(0):
+            # recompute/expand embeddings if needed
+            self.weights = SinusoidalPositionalEmbedding.get_embedding(
+                max_pos, self.embedding_dim, self.padding_idx
+            )
+        self.weights = self.weights.to(self._float_tensor)
+
+        if incremental_state is not None:
+            # positions is the same for every token when decoding a single step
+            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
+            if self.onnx_trace:
+                return (
+                    self.weights.index_select(index=self.padding_idx + pos, dim=0)
+                    .unsqueeze(1)
+                    .repeat(bsz, 1, 1)
+                )
+            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
+
+        positions = utils.make_positions(
+            input, self.padding_idx, onnx_trace=self.onnx_trace
+        )
+        if self.onnx_trace:
+            flat_embeddings = self.weights.detach().index_select(0, positions.view(-1))
+            embedding_shape = torch.cat(
+                (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long))
+            )
+            embeddings = torch.onnx.operators.reshape_from_tensor_shape(
+                flat_embeddings, embedding_shape
+            )
+            return embeddings
+        return (
+            self.weights.index_select(0, positions.view(-1))
+            .view(bsz, seq_len, -1)
+            .detach()
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
new file mode 100644
index 0000000000..3cbd9d6785
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+
+from .multihead_attention import MultiheadAttention
+
+
+class SparseMultiheadAttention(MultiheadAttention):
+    """Sparse Multi-Headed Attention.
+
+    "Generating Long Sequences with Sparse Transformers". Implements
+    fixed factorized self attention, where l=stride and c=expressivity.
+    A(1) includes all words in the stride window and A(2) takes a summary of c
+    words from the end of each stride window.
+    If is_bidirectional=False, we do not include any words past the current word,
+    as in the paper.
+    """
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        self_attention=False,
+        encoder_decoder_attention=False,
+        stride=32,
+        expressivity=8,
+        is_bidirectional=True,
+    ):
+        super().__init__(
+            embed_dim,
+            num_heads,
+            kdim,
+            vdim,
+            dropout,
+            bias,
+            add_bias_kv,
+            add_zero_attn,
+            self_attention,
+            encoder_decoder_attention,
+        )
+
+        self.is_bidirectional = is_bidirectional
+        self.stride = stride
+        self.expressivity = expressivity
+        assert self.stride > 0 and self.stride >= self.expressivity
+
+    # Used for Ai(2) calculations - beginning of [l-c, l] range
+    def compute_checkpoint(self, word_index):
+        if word_index % self.stride == 0 and word_index != 0:
+            checkpoint_index = word_index - self.expressivity
+        else:
+            checkpoint_index = (
+                math.floor(word_index / self.stride) * self.stride
+                + self.stride
+                - self.expressivity
+            )
+        return checkpoint_index
+
+    # Computes Ai(2)
+    def compute_subset_summaries(self, absolute_max):
+        checkpoint_index = self.compute_checkpoint(0)
+        subset_two = set()
+        while checkpoint_index <= absolute_max - 1:
+            subset_two.update(
+                range(
+                    checkpoint_index,
+                    min(checkpoint_index + self.expressivity + 1, absolute_max),
+                )
+            )
+            checkpoint_index = self.compute_checkpoint(checkpoint_index + self.stride)
+        return subset_two
+
+    # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf
+    def compute_fixed_attention_subset(self, word_index, tgt_len):
+        # +1s account for range function; [min, max) -> [min, max]
+        if not self.is_bidirectional:
+            absolute_max = word_index + 1
+        else:
+            absolute_max = tgt_len
+
+        # Subset 1 - whole window
+        rounded_index = (
+            math.floor((word_index + self.stride) / self.stride) * self.stride
+        )
+        if word_index % self.stride == 0 and word_index != 0:
+            subset_one = set(
+                range(word_index - self.stride, min(absolute_max, word_index + 1))
+            )
+        else:
+            subset_one = set(
+                range(
+                    max(0, rounded_index - self.stride),
+                    min(absolute_max, rounded_index + 1),
+                )
+            )
+
+        # Subset 2 - summary per window
+        # If bidirectional, subset 2 is the same for every index
+        subset_two = set()
+        if not self.is_bidirectional:
+            subset_two = self.compute_subset_summaries(absolute_max)
+
+        return subset_one.union(subset_two)
+
+    # Compute sparse mask - if bidirectional, can pre-compute and store
+    def buffered_sparse_mask(self, tensor, tgt_len, src_len):
+        assert tgt_len > self.stride
+        sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float("-inf"))
+
+        # If bidirectional, subset 2 is the same for every index
+        subset_summaries = set()
+        if self.is_bidirectional:
+            subset_summaries = self.compute_subset_summaries(tgt_len)
+
+        for i in range(tgt_len):
+            fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len)
+            fixed_attention_subset = fixed_attention_subset.union(subset_summaries)
+            included_word_indices = torch.LongTensor(list(fixed_attention_subset))
+            sparse_mask[i].index_fill_(0, included_word_indices, 0)
+        return sparse_mask.type_as(tensor)
+
+    def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
+        sparse_mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len)
+        sparse_mask = sparse_mask.unsqueeze(0).expand(
+            bsz * self.num_heads, tgt_len, src_len
+        )
+        attn_weights += sparse_mask
+        # hand the masked weights back: a caller that rebinds the result got None
+        return attn_weights
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
new file mode 100644
index 0000000000..f41ec09327
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn as nn
+from fairseq.modules import TransformerSentenceEncoder
+from fairseq.modules.sparse_transformer_sentence_encoder_layer import (
+    SparseTransformerSentenceEncoderLayer,
+)
+
+
+class SparseTransformerSentenceEncoder(TransformerSentenceEncoder):
+    """
+    Sparse implementation of the TransformerSentenceEncoder
+    - see SparseMultiheadAttention
+    """
+
+    def __init__(
+        self,
+        padding_idx: int,
+        vocab_size: int,
+        num_encoder_layers: int = 6,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        max_seq_len: int = 256,
+        num_segments: int = 2,
+        use_position_embeddings: bool = True,
+        offset_positions_by_padding: bool = True,
+        encoder_normalize_before: bool = False,
+        apply_bert_init: bool = False,
+        activation_fn: str = "relu",
+        learned_pos_embedding: bool = True,
+        embed_scale: float = None,
+        freeze_embeddings: bool = False,
+        n_trans_layers_to_freeze: int = 0,
+        export: bool = False,
+        is_bidirectional: bool = True,
+        stride: int = 32,
+        expressivity: int = 8,
+    ) -> None:
+
+        super().__init__(
+            padding_idx,
+            vocab_size,
+            num_encoder_layers,
+            embedding_dim,
+            ffn_embedding_dim,
+            num_attention_heads,
+            dropout,
+            attention_dropout,
+            activation_dropout,
+            max_seq_len,
+            num_segments,
+            use_position_embeddings,
+            offset_positions_by_padding,
+            encoder_normalize_before,
+            apply_bert_init,
+            activation_fn,
+            learned_pos_embedding,
+            embed_scale,
+            freeze_embeddings,
+            n_trans_layers_to_freeze,
+            export,
+        )
+
+        # Assign self.layers with sparse layers that all share the same settings.
+        layer_kwargs = dict(
+            embedding_dim=self.embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            activation_fn=activation_fn,
+            export=export,
+            is_bidirectional=is_bidirectional,
+            stride=stride,
+            expressivity=expressivity,
+        )
+        built = [
+            SparseTransformerSentenceEncoderLayer(**layer_kwargs)
+            for _ in range(num_encoder_layers)
+        ]
+        self.layers = nn.ModuleList(built)
+
+        # Freeze the parameters of the first n_trans_layers_to_freeze layers.
+        for index in range(n_trans_layers_to_freeze):
+            frozen = self.layers[index]
+            if frozen is not None:
+                for param in frozen.parameters():
+                    param.requires_grad = False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000..d95da59c24
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.modules import TransformerSentenceEncoderLayer
+from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
+
+
+class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
+    """
+    Implements a Sparse Transformer Encoder Layer (see SparseMultiheadAttention)
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = "relu",
+        export: bool = False,
+        is_bidirectional: bool = True,
+        stride: int = 32,
+        expressivity: int = 8,
+    ) -> None:
+        # NOTE(review): passed positionally; confirm the order against the parent
+        super().__init__(
+            embedding_dim,
+            ffn_embedding_dim,
+            num_attention_heads,
+            dropout,
+            attention_dropout,
+            activation_dropout,
+            activation_fn,
+            export,
+        )
+        # self_attn is assigned after the parent constructor ran, with sparse attention
+        self.self_attn = SparseMultiheadAttention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            add_bias_kv=False,
+            add_zero_attn=False,
+            self_attention=True,
+            is_bidirectional=is_bidirectional,
+            stride=stride,
+            expressivity=expressivity,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
new file mode 100644
index 0000000000..19e035dec5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
@@ -0,0 +1,562 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import utils
+from fairseq.models.transformer import TransformerConfig
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+
+
+class TransformerEncoderLayerBase(nn.Module):
+    """Encoder layer block.
+
+    In the original paper each operation (multi-head attention or FFN) is
+    postprocessed with: `dropout -> add residual -> layernorm`. In the
+    tensor2tensor code they suggest that learning is more robust when
+    preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *cfg.encoder.normalize_before* to ``True``.
+
+    Args:
+        cfg (argparse.Namespace): parsed command-line arguments
+    """
+
+    def __init__(self, cfg, return_fc=False):  # cfg exposes .encoder.* and .quant_noise.*
+        super().__init__()
+        self.cfg = cfg
+        self.return_fc = return_fc  # True: forward() also returns the raw fc2 output
+        self.embed_dim = cfg.encoder.embed_dim
+        self.quant_noise = cfg.quant_noise.pq  # passed to the attention and fc builders
+        self.quant_noise_block_size = cfg.quant_noise.pq_block_size
+        self.self_attn = self.build_self_attention(self.embed_dim, cfg)
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+        self.dropout_module = FairseqDropout(
+            cfg.dropout, module_name=self.__class__.__name__
+        )
+        self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn)
+        activation_dropout_p = cfg.activation_dropout
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use cfg.relu_dropout
+            activation_dropout_p = cfg.relu_dropout or 0
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+        self.normalize_before = cfg.encoder.normalize_before  # True: pre-norm; False: post-norm
+        self.fc1 = self.build_fc1(  # embed_dim -> ffn_embed_dim
+            self.embed_dim,
+            cfg.encoder.ffn_embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+        self.fc2 = self.build_fc2(  # ffn_embed_dim -> embed_dim
+            cfg.encoder.ffn_embed_dim,
+            self.embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+        # layer norm paired with the feed-forward sublayer (see forward)
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        """Return the FFN input projection, wrapped by ``quant_noise``."""
+        projection = nn.Linear(input_dim, output_dim)
+        return quant_noise(projection, p=q_noise, block_size=qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        """Return the FFN output projection, wrapped by ``quant_noise``."""
+        projection = nn.Linear(input_dim, output_dim)
+        return quant_noise(projection, p=q_noise, block_size=qn_block_size)
+
+    def _get_fc_rank(self, remove_num: int) -> List[int]:
+        """Return the indices of the ``remove_num`` lowest-scoring FFN units.
+
+        Score of unit ``j`` = L1(fc1 row j) + L1(fc2 column j) + |fc1 bias j|."""
+        w_in, w_out, b_in = self.fc1.weight, self.fc2.weight, self.fc1.bias
+        scores = [
+            w_in[j].abs().sum() + w_out[:, j].abs().sum() + b_in[j].abs()
+            for j in range(self.fc1.out_features)
+        ]
+        # ascending by score; ties keep index order because sorted() is stable
+        return sorted(range(len(scores)), key=scores.__getitem__)[:remove_num]
+
+    def _prune_fc_layer(self, remove_index: List[int]):
+        """Remove the FFN hidden units listed in ``remove_index``.
+
+        ``self.fc1`` is rebuilt without the listed output rows and ``self.fc2``
+        without the matching input columns. Surviving weights and biases are
+        copied (detached) into fresh trainable parameters; the ``fc2`` bias is
+        carried over unchanged. Both new layers are wrapped by ``quant_noise``
+        with this layer's settings, exactly like the layers they replace.
+
+        Args:
+            remove_index (List[int]): indices of the hidden units to drop.
+        """
+        # A set makes each membership test O(1); scanning the list once per
+        # hidden unit cost O(hidden_size * len(remove_index)).
+        dropped = set(remove_index)
+        old_fc1, old_fc2 = self.fc1, self.fc2
+        kept = [i for i in range(old_fc1.out_features) if i not in dropped]
+        # Index tensor on the weights' device so the selection also works off-CPU.
+        kept_idx = torch.tensor(kept, dtype=torch.long, device=old_fc1.weight.device)
+
+        fc1_weight = old_fc1.weight.detach().index_select(0, kept_idx)
+        fc1_bias = old_fc1.bias.detach().index_select(0, kept_idx)
+        fc2_weight = old_fc2.weight.detach().index_select(1, kept_idx)
+        fc2_bias = old_fc2.bias.detach()
+
+        # Size the new layers from what is actually kept, so duplicate entries in
+        # ``remove_index`` cannot desynchronise the layer metadata and its weights.
+        self.fc1 = quant_noise(
+            nn.Linear(old_fc1.in_features, len(kept)),
+            p=self.quant_noise,
+            block_size=self.quant_noise_block_size,
+        )
+        self.fc1.weight = torch.nn.Parameter(fc1_weight)
+        self.fc1.bias = torch.nn.Parameter(fc1_bias)
+
+        self.fc2 = quant_noise(
+            nn.Linear(len(kept), old_fc2.out_features),
+            p=self.quant_noise,
+            block_size=self.quant_noise_block_size,
+        )
+        self.fc2.weight = torch.nn.Parameter(fc2_weight)
+        self.fc2.bias = torch.nn.Parameter(fc2_bias)
+
+    def build_self_attention(self, embed_dim, cfg):
+        """Create this layer's self-attention module from ``cfg``."""
+        attn_kwargs = dict(
+            dropout=cfg.attention_dropout,
+            self_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+            xformers_att_config=cfg.encoder.xformers_att_config,
+        )
+        return MultiheadAttention(embed_dim, cfg.encoder.attention_heads, **attn_kwargs)
+
+    def residual_connection(self, x, residual):
+        return residual + x  # skip connection: sublayer output plus the sublayer's input
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """
+        Migrate legacy layer-norm keys of ``state_dict`` in place:
+        `{name}.layer_norms.0.*` becomes `{name}.self_attn_layer_norm.*` and
+        `{name}.layer_norms.1.*` becomes `{name}.final_layer_norm.*`.
+        """
+        renames = (("0", "self_attn_layer_norm"), ("1", "final_layer_norm"))
+        for legacy_idx, attr in renames:
+            for suffix in ("weight", "bias"):
+                legacy_key = "{}.layer_norms.{}.{}".format(name, legacy_idx, suffix)
+                if legacy_key in state_dict:
+                    renamed = "{}.{}.{}".format(name, attr, suffix)
+                    state_dict[renamed] = state_dict.pop(legacy_key)
+
+    def forward(
+        self,
+        x,
+        encoder_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor] = None,
+    ):
+        """Run self-attention and the feed-forward block, each with a residual.
+
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, seq_len)` where padding elements are indicated by ``1``.
+            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`
+                (both equal `seq_len` here); `attn_mask[tgt_i, src_j] = 1` masks
+                out `src_j` when computing the embedding for `tgt_i`. This is
+                useful for strided self-attention.
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`; when
+            ``self.return_fc`` is set and not scripting, ``(output, fc_result)``.
+        """
+        # Fill the masked entries: anything in the original attn_mask = 1 becomes
+        # -1e8 (-1e4 when x is not float32); anything = 0 stays 0.
+        # Note that we cannot use -inf here, because at some edge cases,
+        # the attention weight (before softmax) for some padded element in query
+        # will become -inf, which results in NaN in model parameters
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(
+                attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4
+            )
+        # --- self-attention sublayer (layer norm before or after, per normalize_before) ---
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        x, _ = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=encoder_padding_mask,
+            need_weights=False,
+            attn_mask=attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        # --- feed-forward sublayer: fc1 -> activation -> dropout -> fc2 ---
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        # keep the fc2 output as it is before dropout and the residual add
+        fc_result = x
+
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        # under TorchScript the tensor alone is returned, whatever return_fc says
+        if self.return_fc and not torch.jit.is_scripting():
+            return x, fc_result
+        return x
+
+
+class TransformerEncoderLayer(TransformerEncoderLayerBase):
+    """Backward-compatible wrapper for the legacy argparse format."""
+
+    def __init__(self, args):
+        super().__init__(TransformerConfig.from_namespace(args))
+        self.args = args
+
+    def build_self_attention(self, embed_dim, args):
+        cfg = TransformerConfig.from_namespace(args)
+        return super().build_self_attention(embed_dim, cfg)
+
+
+class TransformerDecoderLayerBase(nn.Module):
+    """Decoder layer block.
+
+    In the original paper each operation (multi-head attention, encoder
+    attention or FFN) is postprocessed with: `dropout -> add residual ->
+    layernorm`. In the tensor2tensor code they suggest that learning is more
+    robust when preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *cfg.decoder.normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = cfg.decoder.embed_dim
+        self.dropout_module = FairseqDropout(
+            cfg.dropout, module_name=self.__class__.__name__
+        )
+        self.quant_noise = cfg.quant_noise.pq  # passed to the attention and fc builders
+        self.quant_noise_block_size = cfg.quant_noise.pq_block_size
+        # True: forward() lets self-attention also attend over encoder_out
+        self.cross_self_attention = cfg.cross_self_attention
+
+        self.self_attn = self.build_self_attention(
+            self.embed_dim,
+            cfg,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+        self.attn_ln = (  # optional layer norm applied to the self-attention output
+            LayerNorm(self.embed_dim)
+            if utils.safe_getattr(cfg, "scale_attn", False)
+            else None
+        )
+        self.nh = self.self_attn.num_heads
+        self.head_dim = self.self_attn.head_dim
+        scale_heads = utils.safe_getattr(cfg, "scale_heads", False)
+        self.c_attn = (  # optional learned per-head scale, initialised to ones
+            nn.Parameter(torch.ones((self.nh,)), requires_grad=True)
+            if scale_heads
+            else None
+        )
+        # feed-forward activation and the dropout applied right after it
+        self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn)
+        activation_dropout_p = cfg.activation_dropout
+        if activation_dropout_p == 0:
+            # for backwards compatibility with models that use cfg.relu_dropout
+            activation_dropout_p = cfg.relu_dropout or 0
+        self.activation_dropout_module = FairseqDropout(
+            float(activation_dropout_p), module_name=self.__class__.__name__
+        )
+        self.normalize_before = cfg.decoder.normalize_before  # True: pre-norm; False: post-norm
+
+        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+        # encoder-decoder attention and its layer norm are omitted when no_encoder_attn is set
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg)
+            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+        # optional extras, each gated by a cfg flag read via utils.safe_getattr(..., False)
+        self.ffn_layernorm = (  # layer norm over the FFN hidden activations, before fc2
+            LayerNorm(cfg.decoder.ffn_embed_dim)
+            if utils.safe_getattr(cfg, "scale_fc", False)
+            else None
+        )
+        self.w_resid = (  # learned per-channel scale on the FFN residual, initialised to ones
+            nn.Parameter(
+                torch.ones(
+                    self.embed_dim,
+                ),
+                requires_grad=True,
+            )
+            if utils.safe_getattr(cfg, "scale_resids", False)
+            else None
+        )
+        # position-wise feed-forward projections
+        self.fc1 = self.build_fc1(  # embed_dim -> ffn_embed_dim
+            self.embed_dim,
+            cfg.decoder.ffn_embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+        self.fc2 = self.build_fc2(  # ffn_embed_dim -> embed_dim
+            cfg.decoder.ffn_embed_dim,
+            self.embed_dim,
+            self.quant_noise,
+            self.quant_noise_block_size,
+        )
+
+        self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+        self.need_attn = True  # outside training, forward() requests encoder-attention weights
+        # switched on by prepare_for_onnx_export_()
+        self.onnx_trace = False
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):  # FFN input projection
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):  # FFN output projection
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_self_attention(
+        self, embed_dim, cfg, add_bias_kv=False, add_zero_attn=False
+    ):
+        """Create the decoder self-attention module from ``cfg``."""
+        attn_kwargs = dict(
+            dropout=cfg.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=not cfg.cross_self_attention,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+            xformers_att_config=cfg.decoder.xformers_att_config,
+        )
+        return MultiheadAttention(embed_dim, cfg.decoder.attention_heads, **attn_kwargs)
+
+    def build_encoder_attention(self, embed_dim, cfg):
+        """Create the encoder-decoder attention; keys/values use the encoder width."""
+        attn_kwargs = dict(
+            kdim=cfg.encoder.embed_dim,
+            vdim=cfg.encoder.embed_dim,
+            dropout=cfg.attention_dropout,
+            encoder_decoder_attention=True,
+            q_noise=self.quant_noise,
+            qn_block_size=self.quant_noise_block_size,
+            xformers_att_config=cfg.encoder.xformers_att_config,
+        )
+        return MultiheadAttention(embed_dim, cfg.decoder.attention_heads, **attn_kwargs)
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True  # forward() then also returns the cached self-attention state
+
+    def residual_connection(self, x, residual):
+        return residual + x  # skip connection: sublayer output plus the sublayer's input
+
+    def forward(
+        self,
+        x,
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        prev_self_attn_state: Optional[List[torch.Tensor]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """Self-attention, optional encoder attention, then the feed-forward block.
+
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor, optional): binary ByteTensor of
+                shape `(batch, src_len)` where padding elements are ``1``.
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return per-head attention weights
+
+        Returns:
+            tuple ``(x, attn, self_attn_state)``; the last item is ``None``
+            unless ONNX tracing is on and ``incremental_state`` is given.
+        """
+        if need_head_weights:
+            need_attn = True
+        # --- self-attention sublayer ---
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        if prev_self_attn_state is not None:  # seed the incremental buffer from the caller
+            prev_key, prev_value = prev_self_attn_state[:2]
+            saved_state: Dict[str, Optional[Tensor]] = {
+                "prev_key": prev_key,
+                "prev_value": prev_value,
+            }
+            if len(prev_self_attn_state) >= 3:
+                saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
+            assert incremental_state is not None
+            self.self_attn._set_input_buffer(incremental_state, saved_state)
+        _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
+        if self.cross_self_attention and not (  # i.e. no cached "prev_key" is available
+            incremental_state is not None
+            and _self_attn_input_buffer is not None
+            and "prev_key" in _self_attn_input_buffer
+        ):
+            if self_attn_mask is not None:
+                assert encoder_out is not None
+                self_attn_mask = torch.cat(  # zero columns in front: encoder positions unmasked
+                    (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1
+                )
+            if self_attn_padding_mask is not None:
+                if encoder_padding_mask is None:
+                    assert encoder_out is not None
+                    encoder_padding_mask = self_attn_padding_mask.new_zeros(
+                        encoder_out.size(1), encoder_out.size(0)
+                    )
+                self_attn_padding_mask = torch.cat(
+                    (encoder_padding_mask, self_attn_padding_mask), dim=1
+                )
+            assert encoder_out is not None
+            y = torch.cat((encoder_out, x), dim=0)  # keys/values: encoder output followed by x
+        else:
+            y = x
+        # self-attention weights are never requested here (need_weights=False)
+        x, attn = self.self_attn(
+            query=x,
+            key=y,
+            value=y,
+            key_padding_mask=self_attn_padding_mask,
+            incremental_state=incremental_state,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        if self.c_attn is not None:
+            tgt_len, bsz = x.size(0), x.size(1)
+            x = x.view(tgt_len, bsz, self.nh, self.head_dim)
+            x = torch.einsum("tbhd,h->tbhd", x, self.c_attn)  # scale head h by c_attn[h]
+            x = x.reshape(tgt_len, bsz, self.embed_dim)
+        if self.attn_ln is not None:
+            x = self.attn_ln(x)
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        # --- encoder-decoder attention (skipped when absent or encoder_out is None) ---
+        if self.encoder_attn is not None and encoder_out is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+            if prev_attn_state is not None:  # seed the encoder-attention buffer from the caller
+                prev_key, prev_value = prev_attn_state[:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_attn_state) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                assert incremental_state is not None
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=need_attn or (not self.training and self.need_attn),
+                need_head_weights=need_head_weights,
+            )
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+        # --- feed-forward sublayer ---
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        if self.ffn_layernorm is not None:
+            x = self.ffn_layernorm(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        if self.w_resid is not None:
+            residual = torch.mul(self.w_resid, residual)  # learned per-channel residual scale
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        if self.onnx_trace and incremental_state is not None:  # also hand back the cached state
+            saved_state = self.self_attn._get_input_buffer(incremental_state)
+            assert saved_state is not None
+            if self_attn_padding_mask is not None:
+                self_attn_state = [
+                    saved_state["prev_key"],
+                    saved_state["prev_value"],
+                    saved_state["prev_key_padding_mask"],
+                ]
+            else:
+                self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]]
+            return x, attn, self_attn_state
+        return x, attn, None
+
+    def make_generation_fast_(self, need_attn: bool = False, **kwargs):  # extra kwargs are ignored
+        self.need_attn = need_attn  # outside training, forward() requests attention weights only if True
+
+
+class TransformerDecoderLayer(TransformerDecoderLayerBase):
+    """Backward-compatible wrapper for the legacy argparse format: every entry
+    point converts ``args`` with ``TransformerConfig.from_namespace`` and then
+    delegates to ``TransformerDecoderLayerBase``."""
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        cfg = TransformerConfig.from_namespace(args)
+        super().__init__(
+            cfg,
+            no_encoder_attn=no_encoder_attn,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+        self.args = args
+
+    def build_self_attention(
+        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
+    ):
+        cfg = TransformerConfig.from_namespace(args)
+        return super().build_self_attention(
+            embed_dim, cfg, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn
+        )
+
+    def build_encoder_attention(self, embed_dim, args):
+        cfg = TransformerConfig.from_namespace(args)
+        return super().build_encoder_attention(embed_dim, cfg)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
new file mode 100644
index 0000000000..7eb816978a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
@@ -0,0 +1,315 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional
+
+import torch
+from numpy.random import uniform
+from torch import Tensor
+
+from fairseq.modules import LayerNorm
+from fairseq.modules.transformer_layer import TransformerDecoderLayerBase
+
+
+class AugTransformerDecoderLayerBase(TransformerDecoderLayerBase):
+    """Decoder layer block augmented with an additional cross-attention.
+
+    This decoder block is processed with the sequence of the following sub-modules.
+        self-attention -> cross-attention (first) -> cross-attention (second) -> FFN
+
+    Args:
+        cfg (argparse.Namespace): parsed command-line arguments
+        encoder_attn_merge_type (str, optional): the way to combine outputs from
+            two cross-attention modules. If "sequential" is set, two cross-attention
+            modules are stacked sequentially. If "parallel" is set, they are processed
+            in parallel and combined before feeding it to FFN (default: sequential).
+        dropnet_ratio (float, optional): a probability to drop each cross-attention
+            module during training (default: 0.0).
+    """
+
+    def __init__(
+        self,
+        cfg,
+        add_bias_kv=False,
+        add_zero_attn=False,
+        encoder_attn_merge_type="sequential",
+        dropnet_ratio=0.0,
+    ):
+        super().__init__(
+            cfg,
+            no_encoder_attn=False,  # both cross-attention modules are always present
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=False,  # NOTE(review): the add_zero_attn argument is never forwarded - confirm intended
+        )
+        self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg)  # rebuilt; parent already made one
+        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export)
+        self.encoder_attn2 = self.build_encoder_attention(self.embed_dim, cfg)  # second cross-attention
+        if encoder_attn_merge_type == "sequential":
+            self.encoder_attn_layer_norm2 = LayerNorm(self.embed_dim, export=cfg.export)
+        else:
+            self.encoder_attn_layer_norm2 = None  # the parallel branch of forward() uses only encoder_attn_layer_norm
+        # merge strategy checked in forward(): "sequential" or "parallel", anything else raises there
+        self.encoder_attn_merge_type = encoder_attn_merge_type
+        self.dropnet_ratio = dropnet_ratio
+
+    def forward(
+        self,
+        x,
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        encoder_out_aug: Optional[torch.Tensor] = None,
+        encoder_padding_mask2: Optional[torch.Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        prev_self_attn_state: Optional[List[torch.Tensor]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor, optional): binary
+                ByteTensor of shape `(batch, src_len)` where padding
+                elements are indicated by ``1``.
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return attention weights
+                for each head (default: return average over heads).
+
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        if need_head_weights:
+            need_attn = True
+
+        residual = x
+        if self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+        if prev_self_attn_state is not None:
+            prev_key, prev_value = prev_self_attn_state[:2]
+            saved_state: Dict[str, Optional[Tensor]] = {
+                "prev_key": prev_key,
+                "prev_value": prev_value,
+            }
+            if len(prev_self_attn_state) >= 3:
+                saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
+            assert incremental_state is not None
+            self.self_attn._set_input_buffer(incremental_state, saved_state)
+        _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
+        if self.cross_self_attention and not (
+            incremental_state is not None
+            and _self_attn_input_buffer is not None
+            and "prev_key" in _self_attn_input_buffer
+        ):
+            if self_attn_mask is not None:
+                assert encoder_out is not None
+                self_attn_mask = torch.cat(
+                    (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1
+                )
+            if self_attn_padding_mask is not None:
+                if encoder_padding_mask is None:
+                    assert encoder_out is not None
+                    encoder_padding_mask = self_attn_padding_mask.new_zeros(
+                        encoder_out.size(1), encoder_out.size(0)
+                    )
+                self_attn_padding_mask = torch.cat(
+                    (encoder_padding_mask, self_attn_padding_mask), dim=1
+                )
+            assert encoder_out is not None
+            y = torch.cat((encoder_out, x), dim=0)
+        else:
+            y = x
+
+        x, attn = self.self_attn(
+            query=x,
+            key=y,
+            value=y,
+            key_padding_mask=self_attn_padding_mask,
+            incremental_state=incremental_state,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        if self.c_attn is not None:
+            tgt_len, bsz = x.size(0), x.size(1)
+            x = x.view(tgt_len, bsz, self.nh, self.head_dim)
+            x = torch.einsum("tbhd,h->tbhd", x, self.c_attn)
+            x = x.reshape(tgt_len, bsz, self.embed_dim)
+        if self.attn_ln is not None:
+            x = self.attn_ln(x)
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.self_attn_layer_norm(x)
+
+        assert encoder_out is not None
+        assert encoder_out_aug is not None
+
+        if self.encoder_attn_merge_type == "sequential":
+            ratios = self.get_dropnet_ratio()
+
+            # first encoder attention
+            if ratios[0] > 0:
+                residual = x
+                if self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+                if prev_attn_state is not None:
+                    prev_key, prev_value = prev_attn_state[:2]
+                    saved_state: Dict[str, Optional[Tensor]] = {
+                        "prev_key": prev_key,
+                        "prev_value": prev_value,
+                    }
+                    if len(prev_attn_state) >= 3:
+                        saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                    assert incremental_state is not None
+                    self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+
+                x, attn = self.encoder_attn(
+                    query=x,
+                    key=encoder_out,
+                    value=encoder_out,
+                    key_padding_mask=encoder_padding_mask,
+                    incremental_state=incremental_state,
+                    static_kv=True,
+                    need_weights=need_attn or (not self.training and self.need_attn),
+                    need_head_weights=need_head_weights,
+                )
+                x = self.dropout_module(x)
+                x = self.residual_connection(x, residual)
+                if not self.normalize_before:
+                    x = self.encoder_attn_layer_norm(x)
+                x = ratios[0] * x
+
+            # second encoder attention
+            if ratios[1] > 0:
+                residual = x
+                if self.normalize_before:
+                    x = self.encoder_attn_layer_norm2(x)
+                if prev_attn_state is not None:
+                    prev_key, prev_value = prev_attn_state[:2]
+                    saved_state: Dict[str, Optional[Tensor]] = {
+                        "prev_key": prev_key,
+                        "prev_value": prev_value,
+                    }
+                    if len(prev_attn_state) >= 3:
+                        saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                    assert incremental_state is not None
+                    self.encoder_attn2._set_input_buffer(incremental_state, saved_state)
+
+                x, attn2 = self.encoder_attn2(
+                    query=x,
+                    key=encoder_out_aug,
+                    value=encoder_out_aug,
+                    key_padding_mask=encoder_padding_mask2,
+                    incremental_state=incremental_state,
+                    static_kv=True,
+                    need_weights=need_attn or (not self.training and self.need_attn),
+                    need_head_weights=need_head_weights,
+                )
+                x = self.dropout_module(x)
+                x = self.residual_connection(x, residual)
+                if not self.normalize_before:
+                    x = self.encoder_attn_layer_norm2(x)
+                x = ratios[1] * x
+
+        elif self.encoder_attn_merge_type == "parallel":
+            residual = x
+            if self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+            if prev_attn_state is not None:
+                prev_key, prev_value = prev_attn_state[:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_attn_state) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                assert incremental_state is not None
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+
+            x1, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=need_attn or (not self.training and self.need_attn),
+                need_head_weights=need_head_weights,
+            )
+            x2, attn2 = self.encoder_attn2(
+                query=x,
+                key=encoder_out_aug,
+                value=encoder_out_aug,
+                key_padding_mask=encoder_padding_mask2,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=need_attn or (not self.training and self.need_attn),
+                need_head_weights=need_head_weights,
+            )
+            x1 = self.dropout_module(x1)
+            x2 = self.dropout_module(x2)
+            ratios = self.get_dropnet_ratio()
+            x = ratios[0] * x1 + ratios[1] * x2
+            x = self.residual_connection(x, residual)
+            if not self.normalize_before:
+                x = self.encoder_attn_layer_norm(x)
+
+        else:
+            raise NotImplementedError(self.encoder_attn_merge_type)
+
+        residual = x
+        if self.normalize_before:
+            x = self.final_layer_norm(x)
+
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        if self.ffn_layernorm is not None:
+            x = self.ffn_layernorm(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        if self.w_resid is not None:
+            residual = torch.mul(self.w_resid, residual)
+        x = self.residual_connection(x, residual)
+        if not self.normalize_before:
+            x = self.final_layer_norm(x)
+        if self.onnx_trace and incremental_state is not None:
+            saved_state = self.self_attn._get_input_buffer(incremental_state)
+            assert saved_state is not None
+            if self_attn_padding_mask is not None:
+                self_attn_state = [
+                    saved_state["prev_key"],
+                    saved_state["prev_value"],
+                    saved_state["prev_key_padding_mask"],
+                ]
+            else:
+                self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]]
+            return x, attn, attn2, self_attn_state
+        return x, attn, attn2, None
+
+    def get_dropnet_ratio(self):
+        """Return the ``[ratio_1, ratio_2]`` weights of the two encoder attentions.
+
+        With ``dropnet_ratio > 0`` one uniform sample is drawn per call; while
+        training it may drop one branch and shift its weight to the other.
+        Merge types other than "sequential"/"parallel" yield ``None``.
+        """
+        merge_type = self.encoder_attn_merge_type
+        if merge_type == "sequential":
+            only_first, only_second, both = [2, 0], [0, 2], [1, 1]
+        elif merge_type == "parallel":
+            only_first, only_second, both = [1, 0], [0, 1], [0.5, 0.5]
+        else:
+            return None
+        if not self.dropnet_ratio > 0:
+            return both
+        # The sample is taken in eval mode too (only its use depends on
+        # self.training), so the RNG stream advances exactly as before.
+        frand = float(uniform(0, 1))
+        if self.training and frand < self.dropnet_ratio:
+            return only_first
+        if self.training and frand > 1 - self.dropnet_ratio:
+            return only_second
+        return both
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
new file mode 100644
index 0000000000..5d2db91ad7
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
@@ -0,0 +1,291 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from fairseq.modules import (
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    TransformerSentenceEncoderLayer,
+)
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+
+def init_bert_params(module):
+    """
+    Initialize the weights specific to the BERT Model.
+
+    Can be passed to ``nn.Module.apply``; it re-initializes, by module type:
+        1. nn.Linear: weight drawn from a normal distribution (mean 0,
+           std 0.02) and the bias, if present, set to zero.
+        2. nn.Embedding: weight drawn from the same distribution, with the
+           padding_idx row (if any) zeroed afterwards.
+        3. MultiheadAttention: the q_proj, k_proj and v_proj weights drawn
+           from the same distribution.
+    """
+
+    def normal_(data):
+        # with FSDP, module params will be on CUDA, so we sample on CPU and copy
+        # back, so that the RNG is consistent with and without FSDP
+        sampled = data.cpu().normal_(mean=0.0, std=0.02)
+        data.copy_(sampled.to(data.device))
+
+    if isinstance(module, nn.Linear):
+        normal_(module.weight.data)
+        bias = module.bias
+        if bias is not None:
+            bias.data.zero_()
+    if isinstance(module, nn.Embedding):
+        normal_(module.weight.data)
+        pad = module.padding_idx
+        if pad is not None:
+            module.weight.data[pad].zero_()
+    if isinstance(module, MultiheadAttention):
+        for proj_name in ("q_proj", "k_proj", "v_proj"):
+            normal_(getattr(module, proj_name).weight.data)
+
+
+class TransformerSentenceEncoder(nn.Module):
+    """
+    Implementation for a Bi-directional Transformer based Sentence Encoder used
+    in BERT/XLM style pre-trained models.
+
+    This first computes the token embedding using the token embedding matrix,
+    position embeddings (if specified) and segment embeddings (if specified).
+    After applying the specified number of TransformerSentenceEncoderLayers,
+    it outputs all the internal states of the encoder as well as the final
+    representation associated with the first token (usually CLS token).
+
+    Input:
+        - tokens: B x T matrix representing sentences
+        - segment_labels: B x T matrix representing segment label for tokens
+
+    Output:
+        - a tuple of the following:
+            - a list of internal model states used to compute the
+              predictions where each tensor has shape T x B x C
+              (only the last one if last_state_only; stacked if traceable)
+            - sentence representation associated with first input token
+              in format B x C.
+    """
+
+    def __init__(
+        self,
+        padding_idx: int,
+        vocab_size: int,
+        num_encoder_layers: int = 6,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        layerdrop: float = 0.0,
+        max_seq_len: int = 256,
+        num_segments: int = 2,
+        use_position_embeddings: bool = True,
+        offset_positions_by_padding: bool = True,
+        encoder_normalize_before: bool = False,
+        apply_bert_init: bool = False,
+        activation_fn: str = "relu",
+        learned_pos_embedding: bool = True,
+        embed_scale: Optional[float] = None,
+        freeze_embeddings: bool = False,
+        n_trans_layers_to_freeze: int = 0,
+        export: bool = False,
+        traceable: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+    ) -> None:
+
+        super().__init__()
+        self.padding_idx = padding_idx
+        self.vocab_size = vocab_size
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.layerdrop = layerdrop
+        self.max_seq_len = max_seq_len
+        self.embedding_dim = embedding_dim
+        self.num_segments = num_segments
+        self.use_position_embeddings = use_position_embeddings
+        self.apply_bert_init = apply_bert_init
+        self.learned_pos_embedding = learned_pos_embedding
+        self.traceable = traceable
+
+        self.embed_tokens = self.build_embedding(
+            self.vocab_size, self.embedding_dim, self.padding_idx
+        )
+        self.embed_scale = embed_scale
+
+        if q_noise > 0:
+            self.quant_noise = apply_quant_noise_(
+                nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
+                q_noise,
+                qn_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        self.segment_embeddings = (
+            nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None)
+            if self.num_segments > 0
+            else None
+        )
+
+        self.embed_positions = (
+            PositionalEmbedding(
+                self.max_seq_len,
+                self.embedding_dim,
+                padding_idx=(self.padding_idx if offset_positions_by_padding else None),
+                learned=self.learned_pos_embedding,
+            )
+            if self.use_position_embeddings
+            else None
+        )
+
+        if encoder_normalize_before:
+            self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
+        else:
+            self.emb_layer_norm = None
+
+        if self.layerdrop > 0.0:
+            self.layers = LayerDropModuleList(p=self.layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_transformer_sentence_encoder_layer(
+                    embedding_dim=self.embedding_dim,
+                    ffn_embedding_dim=ffn_embedding_dim,
+                    num_attention_heads=num_attention_heads,
+                    dropout=self.dropout_module.p,
+                    attention_dropout=attention_dropout,
+                    activation_dropout=activation_dropout,
+                    activation_fn=activation_fn,
+                    export=export,
+                    q_noise=q_noise,
+                    qn_block_size=qn_block_size,
+                )
+                for _ in range(num_encoder_layers)
+            ]
+        )
+
+        # Apply initialization of model params after building the model
+        if self.apply_bert_init:
+            self.apply(init_bert_params)
+
+        def freeze_module_params(m):
+            if m is not None:
+                for p in m.parameters():
+                    p.requires_grad = False
+
+        if freeze_embeddings:
+            freeze_module_params(self.embed_tokens)
+            freeze_module_params(self.segment_embeddings)
+            freeze_module_params(self.embed_positions)
+            freeze_module_params(self.emb_layer_norm)
+        # freeze the first n_trans_layers_to_freeze encoder layers
+        for layer in range(n_trans_layers_to_freeze):
+            freeze_module_params(self.layers[layer])
+
+    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
+        return nn.Embedding(vocab_size, embedding_dim, padding_idx)
+
+    def build_transformer_sentence_encoder_layer(
+        self,
+        embedding_dim,
+        ffn_embedding_dim,
+        num_attention_heads,
+        dropout,
+        attention_dropout,
+        activation_dropout,
+        activation_fn,
+        export,
+        q_noise,
+        qn_block_size,
+    ):
+        return TransformerSentenceEncoderLayer(
+            embedding_dim=embedding_dim,
+            ffn_embedding_dim=ffn_embedding_dim,
+            num_attention_heads=num_attention_heads,
+            dropout=dropout,
+            attention_dropout=attention_dropout,
+            activation_dropout=activation_dropout,
+            activation_fn=activation_fn,
+            export=export,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        segment_labels: Optional[torch.Tensor] = None,
+        last_state_only: bool = False,
+        positions: Optional[torch.Tensor] = None,
+        token_embeddings: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        is_tpu = tokens.device.type == "xla"
+
+        # compute padding mask. This is needed for multi-head attention
+        padding_mask = tokens.eq(self.padding_idx)
+        if not self.traceable and not is_tpu and not padding_mask.any():
+            padding_mask = None
+
+        if token_embeddings is not None:
+            x = token_embeddings
+        else:
+            x = self.embed_tokens(tokens)
+
+        if self.embed_scale is not None:
+            x = x * self.embed_scale
+
+        if self.embed_positions is not None:
+            x = x + self.embed_positions(tokens, positions=positions)
+
+        if self.segment_embeddings is not None and segment_labels is not None:
+            x = x + self.segment_embeddings(segment_labels)
+
+        if self.quant_noise is not None:
+            x = self.quant_noise(x)
+
+        if self.emb_layer_norm is not None:
+            x = self.emb_layer_norm(x)
+
+        x = self.dropout_module(x)
+
+        # account for padding while computing the representation
+        if padding_mask is not None:
+            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        inner_states = []
+        if not last_state_only:
+            inner_states.append(x)
+
+        for layer in self.layers:
+            x, _ = layer(
+                x, self_attn_padding_mask=padding_mask, self_attn_mask=attn_mask
+            )
+            if not last_state_only:
+                inner_states.append(x)
+        # x is T x B x C here, so index 0 picks the first token of every sentence
+        sentence_rep = x[0, :, :]
+
+        if last_state_only:
+            inner_states = [x]
+        # traceable mode returns one stacked tensor instead of a Python list
+        if self.traceable:
+            return torch.stack(inner_states), sentence_rep
+        else:
+            return inner_states, sentence_rep
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000..f869c4b2f8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+
+
+class TransformerSentenceEncoderLayer(nn.Module):
+    """
+    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+    models.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        activation_fn: str = "relu",
+        export: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+        init_fn: Optional[Callable] = None,
+    ) -> None:
+        super().__init__()
+        # Optional caller-supplied hook, run before any submodule is built.
+        if init_fn is not None:
+            init_fn()
+
+        # Initialize parameters
+        self.embedding_dim = embedding_dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_dropout = attention_dropout
+        self.q_noise = q_noise
+        self.qn_block_size = qn_block_size
+
+        self.dropout_module = FairseqDropout(
+            dropout, module_name=self.__class__.__name__
+        )
+        self.activation_dropout_module = FairseqDropout(
+            activation_dropout, module_name=self.__class__.__name__
+        )
+
+        # Initialize blocks
+        self.activation_fn = utils.get_activation_fn(activation_fn)
+        self.self_attn = self.build_self_attention(
+            self.embedding_dim,
+            num_attention_heads,
+            dropout=attention_dropout,
+            self_attention=True,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+        # layer norm associated with the self attention layer
+        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)
+
+        self.fc1 = self.build_fc1(
+            self.embedding_dim,
+            ffn_embedding_dim,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+        self.fc2 = self.build_fc2(
+            ffn_embedding_dim,
+            self.embedding_dim,
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+        # layer norm associated with the position wise feed-forward NN
+        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
+
+    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def build_self_attention(
+        self,
+        embed_dim,
+        num_attention_heads,
+        dropout,
+        self_attention,
+        q_noise,
+        qn_block_size,
+    ):
+        return MultiheadAttention(
+            embed_dim,
+            num_attention_heads,
+            dropout=dropout,
+            self_attention=self_attention,  # honor the caller's flag
+            q_noise=q_noise,
+            qn_block_size=qn_block_size,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        LayerNorm is applied after each residual connection (post-norm), i.e.
+        after the self-attention block and after the ffn block.
+        """
+        residual = x
+        x, attn = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.self_attn_layer_norm(x)
+
+        residual = x
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        x = self.final_layer_norm(x)
+        return x, attn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
new file mode 100644
index 0000000000..d7cca9a4bb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
@@ -0,0 +1,21 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+transpose last 2 dimensions of the input
+"""
+
+import torch.nn as nn
+
+
+class TransposeLast(nn.Module):
+    """Swap ``tranpose_dim`` with the last dim, after optionally indexing the input."""
+
+    def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
+        super().__init__()
+        self.tranpose_dim, self.deconstruct_idx = tranpose_dim, deconstruct_idx
+
+    def forward(self, x):
+        picked = x if self.deconstruct_idx is None else x[self.deconstruct_idx]
+        return picked.transpose(self.tranpose_dim, -1)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
new file mode 100644
index 0000000000..bbaafbd6bf
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn.functional as F
+
+
+def unfold1d(x, kernel_size: int, padding_l: int, pad_value: float = 0):
+    """Unfold T x B x C into T x B x C x K by windowing over the time axis."""
+    if kernel_size <= 1:
+        return x.unsqueeze(3)
+    seq_len, bsz, channels = x.size()
+    # Pad the time axis only: padding_l steps before, the rest of K - 1 after.
+    pad_spec = (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l)
+    padded = F.pad(x, pad_spec, value=pad_value)
+    # Slot k of step t aliases padded[t + k] (overlapping strided view).
+    out_shape, step = (seq_len, bsz, channels, kernel_size), bsz * channels
+    return padded.as_strided(out_shape, (step, channels, 1, step))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
new file mode 100644
index 0000000000..ee5ee19a34
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections.abc import Iterable
+from itertools import repeat
+
+import torch
+import torch.nn as nn
+
+
+def _pair(v):
+    if not isinstance(v, Iterable):  # scalar: duplicate it into a 2-tuple
+        return tuple(repeat(v, 2))
+    assert len(v) == 2, "len(v) != 2"
+    return v
+
+
+def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
+    """Probe ``conv_op`` with a random batch to learn its output feature sizes.
+
+    Returns ``(C * W, W)`` of the op's N x C x H x W output.
+    """
+    sample_bsz, sample_seq_len = 10, 200
+    # Probe layout is N x C x H x W: H is the sequence axis, W is input_dim.
+    probe = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
+    # Move the sequence axis next to the batch axis: N x H x C x W.
+    out = conv_op(probe).transpose(1, 2)
+    dims = out.size()
+    bsz, seq, per_channel_dim = dims[0], dims[1], dims[3]
+    # Everything after (N, H) is flattened into one feature axis of size C * W.
+    return out.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim
+
+
+class VGGBlock(torch.nn.Module):
+    """
+    VGG motivated cnn module https://arxiv.org/pdf/1409.1556.pdf
+
+    Args:
+        in_channels: (int) number of input channels (typically 1)
+        out_channels: (int) number of output channels
+        conv_kernel_size: convolution kernel size (a single number or a pair)
+        pooling_kernel_size: max-pooling window size, or None for no pooling
+        num_conv_layers: (int) number of convolution layers
+        input_dim: (int) input dimension
+        conv_stride: the stride of the convolving kernel.
+            Can be a single number or a tuple (sH, sW)  Default: 1
+        padding: implicit paddings on both sides of the input.
+            A single number or a tuple (padH, padW). Default: None (kernel // 2)
+        layer_norm: (bool) if layer norm is going to be applied. Default: False
+
+    Shape:
+        Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
+        Output: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        conv_kernel_size,
+        pooling_kernel_size,
+        num_conv_layers,
+        input_dim,
+        conv_stride=1,
+        padding=None,
+        layer_norm=False,
+    ):
+        msg = "Need input_dim for LayerNorm and infer_conv_output_dim"
+        assert input_dim is not None, msg
+        super(VGGBlock, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.conv_kernel_size = _pair(conv_kernel_size)
+        # _pair(None) is (None, None), never None: keep None so pooling can be off.
+        self.pooling_kernel_size = (
+            None if pooling_kernel_size is None else _pair(pooling_kernel_size)
+        )
+        self.num_conv_layers = num_conv_layers
+        default_padding = tuple(e // 2 for e in self.conv_kernel_size)
+        self.padding = default_padding if padding is None else _pair(padding)
+        self.conv_stride = _pair(conv_stride)
+
+        self.layers = nn.ModuleList()
+        for layer in range(num_conv_layers):
+            layer_in = in_channels if layer == 0 else out_channels
+            conv_op = nn.Conv2d(
+                layer_in,
+                out_channels,
+                self.conv_kernel_size,
+                stride=self.conv_stride,
+                padding=self.padding,
+            )
+            self.layers.append(conv_op)
+            if layer_norm:
+                _, per_channel_dim = infer_conv_output_dim(conv_op, input_dim, layer_in)
+                self.layers.append(nn.LayerNorm(per_channel_dim))
+                input_dim = per_channel_dim
+            self.layers.append(nn.ReLU())
+
+        if self.pooling_kernel_size is not None:
+            pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True)
+            self.layers.append(pool_op)
+            self.total_output_dim, self.output_dim = infer_conv_output_dim(
+                pool_op, input_dim, out_channels
+            )
+        else:  # no pooling: the sizes are those of the un-pooled features
+            self.total_output_dim, self.output_dim = out_channels * input_dim, input_dim
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
new file mode 100644
index 0000000000..bd0f911073
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+
+
+logger = logging.getLogger(__name__)
+
+
+class NanDetector:
+    """
+    Detects the first NaN or Inf in forward and/or backward pass and logs, together with the module name
+
+    Hooks are attached to every submodule on construction and removed when the
+    ``with`` block exits (or ``close()`` is called).
+    """
+
+    def __init__(self, model, forward=True, backward=True):
+        self.bhooks = []
+        self.fhooks = []
+        self.forward = forward
+        self.backward = backward
+        self.named_parameters = list(model.named_parameters())
+        self.reset()
+        # Tag each module with its qualified name for the log messages; inside
+        # this class the attribute is name-mangled to _NanDetector__module_name.
+        for name, mod in model.named_modules():
+            mod.__module_name = name
+            self.add_hooks(mod)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        # Dump out all model gnorms (gradient norms) to enable better debugging
+        norm = {}
+        gradients = {}
+        for name, param in self.named_parameters:
+            if param.grad is not None:
+                grad_norm = torch.norm(param.grad.data.float(), p=2)
+                norm[name] = grad_norm.item()
+                if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any():
+                    gradients[name] = param.grad.data
+        if len(gradients) > 0:
+            logger.info("Detected nan/inf grad norm, dumping norms...")
+            logger.info(f"norms: {norm}")
+            logger.info(f"gradients: {gradients}")
+
+        self.close()
+
+    def add_hooks(self, module):
+        if self.forward:
+            self.fhooks.append(module.register_forward_hook(self.fhook_fn))
+        if self.backward:
+            self.bhooks.append(module.register_backward_hook(self.bhook_fn))
+
+    def reset(self):
+        self.has_printed_f = False
+        self.has_printed_b = False
+
+    def _detect(self, tensor, name, backward):
+        """Return a message if ``tensor`` holds a NaN/Inf, else None."""
+        err = None
+        # single value tensors (like the loss) will not provide much info
+        if torch.is_floating_point(tensor) and tensor.numel() >= 2:
+            with torch.no_grad():
+                if torch.isnan(tensor).any():
+                    err = "NaN"
+                elif torch.isinf(tensor).any():
+                    err = "Inf"
+        if err is not None:
+            err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}"
+        return err
+
+    def _apply(self, module, inp, x, backward):
+        """Check ``x`` (a tensor or nested dict/list/tuple of them) and log any hit."""
+        if torch.is_tensor(x):
+            if isinstance(inp, tuple) and len(inp) > 0:
+                inp = inp[0]
+            err = self._detect(x, module.__module_name, backward)
+            if err is not None:
+                if torch.is_tensor(inp) and not backward:
+                    err += (
+                        f" input max: {inp.max().item()}, input min: {inp.min().item()}"
+                    )
+                # Flag this direction as reported so later hook calls skip it.
+                has_printed_attr = "has_printed_b" if backward else "has_printed_f"
+                logger.warning(err)
+                setattr(self, has_printed_attr, True)
+        elif isinstance(x, (dict, list, tuple)):
+            for v in x.values() if isinstance(x, dict) else x:
+                self._apply(module, inp, v, backward)
+
+    def fhook_fn(self, module, inp, output):
+        if not self.has_printed_f:
+            self._apply(module, inp, output, backward=False)
+
+    def bhook_fn(self, module, inp, output):
+        if not self.has_printed_b:
+            self._apply(module, inp, output, backward=True)
+
+    def close(self):
+        for hook in self.fhooks + self.bhooks:
+            hook.remove()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
new file mode 100644
index 0000000000..4eb5030311
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
@@ -0,0 +1,120 @@
+# Originally from Microsoft Corporation.
+# Licensed under the MIT License.
+
+""" Wrapper for ngram_repeat_block cuda extension """
+import math
+import warnings
+from typing import List
+
+import torch
+from torch import nn
+
+try:
+    from fairseq import ngram_repeat_block_cuda
+
+    EXTENSION_BUILT = True
+except ImportError:
+    EXTENSION_BUILT = False
+
+
+def is_cuda_extension_usable() -> bool:
+    """Check whether ngram_repeat_block_cuda is built properly"""
+    if not EXTENSION_BUILT or not torch.cuda.is_available():
+        return False
+    bsz = 2
+    tokens = torch.tensor([[4, 4, 3, 2], [1, 2, 3, 4]], dtype=torch.long, device="cuda")
+    lprobs = torch.rand((8, 12), device="cuda")
+    try:
+        outputs = ngram_repeat_block_cuda.forward(tokens, lprobs, bsz, 3, 4, 3)
+        outputs = outputs + 4  # This line breaks if the extension is built incorrectly.
+        return True
+    except RuntimeError:
+        warnings.warn(
+            "NGramRepeatBlock extension must be rebuilt."
+            'Run TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0" python setup.py build_ext --inplace'
+        )
+        return False
+
+
+class NGramRepeatBlock(nn.Module):
+    """Block repeated n-grams via the CUDA extension, or a Python fallback without it"""
+
+    def __init__(self, no_repeat_ngram_size: int, use_extension: bool = True):
+        super().__init__()
+        self.use_extension = is_cuda_extension_usable() if use_extension else False
+        self.no_repeat_ngram_size = no_repeat_ngram_size
+
+    def reset_parameters(self):
+        pass  # intentionally a no-op
+
+    @torch.jit.unused
+    def call_cuda_extension(
+        self,
+        tokens,
+        lprobs,
+        bsz: int,
+        beam_size: int,
+        step: int,
+    ):
+        return ngram_repeat_block_cuda.forward(  # note: step is passed before beam_size
+            tokens, lprobs, bsz, step, beam_size, self.no_repeat_ngram_size
+        )
+
+    def forward(
+        self,
+        tokens,
+        lprobs,
+        bsz: int,
+        beam_size: int,
+        step: int,
+    ):
+        """
+        Args:
+            tokens(Tensor): Input tokens(Bsz*beam, seq_len)
+            lprobs(Tensor): likelihood probability,
+            Expected to be updated in place.(Bsz*beam, vocab_size)
+            bsz(int): batch size
+            beam_size(int): beam size
+            step(int): current step
+        Returns the result of the CUDA extension if usable, else of _no_repeat_ngram.
+        """
+        msg = f"expected {bsz *beam_size} got"
+        assert tokens.size(0) == bsz * beam_size, f"{msg} {tokens.size(0)}"
+        assert lprobs.size(0) == bsz * beam_size, f"{msg} {lprobs.size(0)}"
+        if self.use_extension:
+            return self.call_cuda_extension(tokens, lprobs, bsz, beam_size, step)
+
+        else:
+            return self._no_repeat_ngram(
+                tokens,
+                lprobs,
+                bsz,
+                beam_size,
+                step,
+            )
+
+    def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int):
+        """For each hypothesis generate a list of previous ngrams and set associated lprobs to -inf"""
+        banned_tokens = [
+            torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size)
+        ]
+        if step + 2 - self.no_repeat_ngram_size >= 0:
+            cpu_tokens: List[List[int]] = tokens.cpu().tolist()
+            check_start_pos = step + 2 - self.no_repeat_ngram_size
+            for bbsz_idx in range(bsz * beam_size):
+                ngram_to_check = cpu_tokens[bbsz_idx][  # last (n - 1) tokens of this row
+                    -(self.no_repeat_ngram_size - 1) :
+                ]
+                for i in range(check_start_pos):
+                    if (
+                        ngram_to_check
+                        == cpu_tokens[bbsz_idx][i : i + self.no_repeat_ngram_size - 1]
+                    ):
+                        banned_tokens[bbsz_idx].append(  # ban the token after the match
+                            cpu_tokens[bbsz_idx][i + self.no_repeat_ngram_size - 1]
+                        )
+        for bbsz_idx in range(bsz * beam_size):
+            lprobs[bbsz_idx][  # in-place write of -inf at the banned token ids
+                torch.tensor(banned_tokens[bbsz_idx], dtype=torch.int64)
+            ] = torch.tensor(-math.inf).to(lprobs)
+        return lprobs
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
new file mode 100644
index 0000000000..be783be896
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.optim.bmuf import FairseqBMUF  # noqa
+from fairseq.optim.fairseq_optimizer import (  # noqa
+    FairseqOptimizer,
+    LegacyFairseqOptimizer,
+)
+from fairseq.optim.amp_optimizer import AMPOptimizer
+from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer
+from fairseq.optim.shard import shard_
+from omegaconf import DictConfig
+
+__all__ = [
+    "AMPOptimizer",
+    "FairseqOptimizer",
+    "FP16Optimizer",
+    "MemoryEfficientFP16Optimizer",
+    "shard_",
+]
+
+(
+    _build_optimizer,  # raw builder; wrapped by build_optimizer() below
+    register_optimizer,  # decorator applied to the optimizer classes in this package
+    OPTIMIZER_REGISTRY,
+    OPTIMIZER_DATACLASS_REGISTRY,
+) = registry.setup_registry("--optimizer", base_class=FairseqOptimizer, required=True)
+
+
+def build_optimizer(cfg: DictConfig, params, *extra_args, **extra_kwargs):
+    if all(isinstance(p, dict) for p in params):
+        params = [t for p in params for t in p.values()]
+    params = list(filter(lambda p: p.requires_grad, params))
+    return _build_optimizer(cfg, params, *extra_args, **extra_kwargs)
+
+
+# Import every non-underscore .py file in optim/ (presumably to run its @register_optimizer)
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        file_name = file[: file.find(".py")]  # cut at the first ".py" occurrence
+        importlib.import_module("fairseq.optim." + file_name)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
new file mode 100644
index 0000000000..f1a2154977
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("adadelta")
+class Adadelta(LegacyFairseqOptimizer):
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO',
+                            help='coefficient used for computing a running average of squared gradients')
+        parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS',
+                            help='term added to the denominator to improve numerical stability')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "rho": self.args.adadelta_rho,
+            "eps": self.args.adadelta_eps,
+            "weight_decay": self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
new file mode 100644
index 0000000000..042ae926b0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.optim
+
+from . import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("adafactor")
+class FairseqAdafactor(LegacyFairseqOptimizer):
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = Adafactor(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adafactor-eps', default='(1e-30, 1e-3)', metavar="E",
+                            help='epsilons for Adafactor optimizer')
+        parser.add_argument('--clip-threshold', type=float, default=1.0, metavar="C",
+                            help='threshold for clipping update root mean square')
+        parser.add_argument('--decay-rate', type=float, default=-0.8, metavar="D",
+                            help='decay rate of the second moment estimator')
+        parser.add_argument('--beta1', type=float, default=None, metavar="B",
+                            help='beta for first moment estimator. Optional')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--scale-parameter', action='store_true',
+                            help='scale learning rate by root mean square of parameter')
+        parser.add_argument('--relative-step', action='store_true',
+                            help='set learning rate to inverse square root of timestep,'
+                                 'otherwise use external learning rate')
+        parser.add_argument('--warmup-init', action='store_true',
+                            help='use relative step for warm-up learning rate schedule')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        Note : Convergence issues empirically observed with fp16 on.
+               Might require search for appropriate configuration.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "eps": eval(self.args.adafactor_eps),
+            "clip_threshold": self.args.clip_threshold,
+            "decay_rate": self.args.decay_rate,
+            "beta1": self.args.beta1,
+            "weight_decay": self.args.weight_decay,
+            "scale_parameter": self.args.scale_parameter,  # defaults to False
+            "relative_step": self.args.relative_step,  # defaults to False
+            "warmup_init": self.args.warmup_init,
+        }
+
+
+class Adafactor(torch.optim.Optimizer):
+    """Implements Adafactor algorithm.
+
+    This implementation is based on:
+    `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
+    (see https://arxiv.org/abs/1804.04235)
+
+    Note that this optimizer internally adjusts the learning rate
+    depending on the *scale_parameter*, *relative_step* and
+    *warmup_init* options. To use a manual (external) learning rate
+    schedule you should set `scale_parameter=False` and
+    `relative_step=False`.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): external learning rate (default: None)
+        eps (tuple[float, float]): regularization constants for square gradient
+            and parameter scale respectively (default: (1e-30, 1e-3))
+        clip_threshold (float): threshold of root mean square of
+            final gradient update (default: 1.0)
+        decay_rate (float): coefficient used to compute running averages of square
+            gradient (default: -0.8)
+        beta1 (float): coefficient used for computing running averages of gradient
+            (default: None)
+        weight_decay (float, optional): weight decay, applied as p -= wd * lr * p (default: 0)
+        scale_parameter (bool): if True, learning rate is scaled by root mean square of
+            parameter (default: True)
+        relative_step (bool): if True, time-dependent learning rate is computed
+            instead of external learning rate (default: True)
+        warmup_init (bool): time-dependent learning rate computation depends on
+            whether warm-up initialization is being used (default: False)
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=None,
+        eps=(1e-30, 1e-3),
+        clip_threshold=1.0,
+        decay_rate=-0.8,
+        beta1=None,
+        weight_decay=0.0,
+        scale_parameter=True,
+        relative_step=True,
+        warmup_init=False,
+    ):
+        if lr is not None and relative_step:
+            raise ValueError("Cannot combine manual lr and relative_step options")
+        if warmup_init and not relative_step:
+            raise ValueError("warmup_init requires relative_step=True")
+
+        defaults = dict(
+            lr=lr,
+            eps=eps,
+            clip_threshold=clip_threshold,
+            decay_rate=decay_rate,
+            beta1=beta1,
+            weight_decay=weight_decay,
+            scale_parameter=scale_parameter,
+            relative_step=relative_step,
+            warmup_init=warmup_init,
+        )
+        super(Adafactor, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return False
+
+    def _get_lr(self, param_group, param_state):  # (relative or group) lr times RMS scale
+        rel_step_sz = param_group["lr"]
+        if param_group["relative_step"]:
+            min_step = (
+                1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2
+            )
+            rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"]))
+        param_scale = 1.0
+        if param_group["scale_parameter"]:
+            param_scale = max(param_group["eps"][1], param_state["RMS"])
+        return param_scale * rel_step_sz
+
+    def _get_options(self, param_group, param_shape):  # -> (factored, use_first_moment)
+        factored = len(param_shape) >= 2
+        use_first_moment = param_group["beta1"] is not None
+        return factored, use_first_moment
+
+    def _rms(self, tensor):  # root mean square over all elements
+        return tensor.norm(2) / (tensor.numel() ** 0.5)
+
+    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col):  # broadcast row x col rsqrt
+        r_factor = (
+            (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True))
+            .rsqrt_()
+            .unsqueeze(-1)
+        )
+        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
+        return torch.mul(r_factor, c_factor)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.dtype in {torch.float16, torch.bfloat16}:
+                    grad = grad.float()  # do the update math in fp32
+                if grad.is_sparse:
+                    raise RuntimeError("Adafactor does not support sparse gradients.")
+
+                state = self.state[p]
+                grad_shape = grad.shape
+
+                factored, use_first_moment = self._get_options(group, grad_shape)
+                # State Initialization
+                if len(state) == 0:
+                    state["step"] = 0
+
+                    if use_first_moment:
+                        # Exponential moving average of gradient values
+                        state["exp_avg"] = torch.zeros_like(grad)
+                    if factored:
+                        state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
+                        state["exp_avg_sq_col"] = torch.zeros(
+                            grad_shape[:-2] + grad_shape[-1:]
+                        ).to(grad)
+                    else:
+                        state["exp_avg_sq"] = torch.zeros_like(grad)
+
+                    state["RMS"] = 0
+                else:
+                    if use_first_moment:
+                        state["exp_avg"] = state["exp_avg"].to(grad)
+                    if factored:
+                        state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
+                        state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
+                    else:
+                        state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)
+
+                p_data_fp32 = p.data  # aliases p.data unless the param is fp16/bf16
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+
+                state["step"] += 1
+                state["RMS"] = self._rms(p_data_fp32)
+                group["lr"] = self._get_lr(group, state)  # NOTE(review): overwrites group lr;
+                # with relative_step=False and scale_parameter=True the next param re-scales it
+                beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
+                update = (grad**2) + group["eps"][0]
+                if factored:
+                    exp_avg_sq_row = state["exp_avg_sq_row"]
+                    exp_avg_sq_col = state["exp_avg_sq_col"]
+
+                    exp_avg_sq_row.mul_(beta2t).add_(
+                        update.mean(dim=-1), alpha=1.0 - beta2t
+                    )
+                    exp_avg_sq_col.mul_(beta2t).add_(
+                        update.mean(dim=-2), alpha=1.0 - beta2t
+                    )
+
+                    # Approximation of exponential moving average of square of gradient
+                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
+                    update.mul_(grad)
+                else:
+                    exp_avg_sq = state["exp_avg_sq"]
+
+                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
+                    update = exp_avg_sq.rsqrt().mul_(grad)
+
+                update.div_(  # shrink only when RMS(update) exceeds clip_threshold
+                    (self._rms(update) / group["clip_threshold"]).clamp_(min=1.0)
+                )
+                update.mul_(group["lr"])
+
+                if use_first_moment:
+                    exp_avg = state["exp_avg"]
+                    exp_avg.mul_(group["beta1"]).add_(update, alpha=1 - group["beta1"])
+                    update = exp_avg
+
+                if group["weight_decay"] != 0:
+                    p_data_fp32.add_(
+                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
+                    )
+
+                p_data_fp32.add_(-update)
+
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)  # write the fp32 copy back into the param
+
+        return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
new file mode 100644
index 0000000000..4f539541c1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("adagrad")
+class Adagrad(LegacyFairseqOptimizer):
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "weight_decay": self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
new file mode 100644
index 0000000000..678ec7c617
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
@@ -0,0 +1,239 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import math
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import Any, List
+
+import torch
+import torch.distributed as dist
+import torch.optim
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim import FairseqOptimizer, register_optimizer
+from fairseq.optim.fused_adam import get_fused_adam_class
+from omegaconf import II, OmegaConf
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FairseqAdamConfig(FairseqDataclass):
+    adam_betas: Any = field(  # str or config container; see FairseqAdam.optimizer_config
+        default=(0.9, 0.999), metadata={"help": "betas for Adam optimizer"}
+    )
+    adam_eps: float = field(
+        default=1e-8, metadata={"help": "epsilon for Adam optimizer"}
+    )
+    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
+    use_old_adam: bool = field(
+        default=False, metadata={"help": "Use fairseq.optim.adam.Adam"}
+    )
+    fp16_adam_stats: bool = field(
+        default=False, metadata={"help": "use FP16 stats (with automatic scaling)"}
+    )
+    # TODO common vars below in parent
+    tpu: bool = II("common.tpu")  # OmegaConf interpolation of common.tpu
+    lr: List[float] = II("optimization.lr")  # OmegaConf interpolation of optimization.lr
+
+
+@register_optimizer("adam", dataclass=FairseqAdamConfig)
+class FairseqAdam(FairseqOptimizer):
+    """Adam optimizer for fairseq.
+
+    Important note: this optimizer corresponds to the "AdamW" variant of
+    Adam in its weight decay behavior. As such, it is most closely
+    analogous to torch.optim.AdamW from PyTorch.
+    """
+
+    def __init__(self, cfg: FairseqAdamConfig, params):
+        super().__init__(cfg)
+        fused_adam_cls = get_fused_adam_class()
+        use_fused_adam = (  # needs use_old_adam unset, a fused class, and CUDA
+            not getattr(cfg, "use_old_adam", False)
+            and fused_adam_cls is not None
+            and torch.cuda.is_available()
+        )
+        if getattr(cfg, "tpu", False):
+            if self.cfg.fp16_adam_stats:
+                raise NotImplementedError("--fp16-adam-stats is only supported on GPU")
+            # on TPUs we use the Adam defined here, since it
+            # automatically casts gradients to FP32
+            self._optimizer = Adam(params, **self.optimizer_config)
+        elif use_fused_adam:
+            logger.info("using FusedAdam")
+            self._optimizer = fused_adam_cls(
+                params, use_fp16_stats=self.cfg.fp16_adam_stats, **self.optimizer_config
+            )
+        else:
+            if self.cfg.fp16_adam_stats:
+                raise NotImplementedError(
+                    "--fp16-adam-stats is only supported with FusedAdamV1"
+                )
+            self._optimizer = Adam(params, **self.optimizer_config)
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.cfg.lr[0]
+            if isinstance(self.cfg.lr, Collection)
+            else self.cfg.lr,
+            "betas": eval(self.cfg.adam_betas)  # NOTE(review): eval() of a config string
+            if isinstance(self.cfg.adam_betas, str)
+            else OmegaConf.to_container(self.cfg.adam_betas),
+            "eps": self.cfg.adam_eps,
+            "weight_decay": self.cfg.weight_decay,
+        }
+
+    def average_params(self):
+        """Reduce Params is only used during BMUF distributed training."""
+        state_dict = self.optimizer.state_dict()
+        total_gpus = float(dist.get_world_size())
+        # pre-divide each moment so the SUM all-reduce below yields the group mean
+        for _, value in state_dict["state"].items():
+            value["exp_avg"] /= total_gpus
+            value["exp_avg_sq"] /= total_gpus
+            dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
+            dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
+
+
+class Adam(torch.optim.Optimizer):
+    r"""Implements Adam algorithm.
+
+    This implementation is modified from torch.optim.Adam based on:
+    `Fixed Weight Decay Regularization in Adam`
+    (see https://arxiv.org/abs/1711.05101)
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): decoupled weight decay, scaled by lr (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+
+    .. _Adam\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        amsgrad=False,
+    ):
+        defaults = dict(
+            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad
+        )
+        super(Adam, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.dtype in {torch.float16, torch.bfloat16}:
+                    grad = grad.float()
+                if grad.is_sparse:
+                    raise RuntimeError(
+                        "Adam does not support sparse gradients, please consider SparseAdam instead"
+                    )
+                amsgrad = group.get("amsgrad", False)
+
+                p_data_fp32 = p.data  # aliases p.data unless the param is fp16/bf16
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state["step"] = 0
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32)
+                else:
+                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
+                    state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
+                    if amsgrad:
+                        state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(
+                            p_data_fp32
+                        )
+
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                if amsgrad:
+                    max_exp_avg_sq = state["max_exp_avg_sq"]
+                beta1, beta2 = group["betas"]
+
+                state["step"] += 1
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                if amsgrad:
+                    # Maintains the maximum of all 2nd moment running avg. till now
+                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                    # Use the max. for normalizing running avg. of gradient
+                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
+                else:
+                    denom = exp_avg_sq.sqrt().add_(group["eps"])
+
+                bias_correction1 = 1 - beta1 ** state["step"]
+                bias_correction2 = 1 - beta2 ** state["step"]
+                step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1
+                # decoupled weight decay: shrink the parameter itself, not the gradient
+                if group["weight_decay"] != 0:
+                    p_data_fp32.add_(
+                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
+                    )
+
+                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
+                # half-precision params were updated on an fp32 copy; write it back
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)
+
+        return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
new file mode 100644
index 0000000000..98ff8ad7ad
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.optim
+
+from . import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("adamax")
+class FairseqAdamax(LegacyFairseqOptimizer):
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = Adamax(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--adamax-betas', default='(0.9, 0.999)', metavar='B',
+                            help='betas for Adam optimizer')
+        parser.add_argument('--adamax-eps', type=float, default=1e-8, metavar='D',
+                            help='epsilon for Adam optimizer')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        parser.add_argument('--no-bias-correction', default=False, action='store_true',
+                            help='disable bias correction')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "betas": eval(self.args.adamax_betas),  # NOTE(review): eval() on CLI text
+            "eps": self.args.adamax_eps,
+            "weight_decay": self.args.weight_decay,
+            "bias_correction": not self.args.no_bias_correction,
+        }
+
+
+class Adamax(torch.optim.Optimizer):
+    """Implements Adamax algorithm (a variant of Adam based on infinity norm).
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`__.
+
+    Compared to the version in PyTorch, this version implements a fix for weight decay.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 2e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        bias_correction (bool, optional): enable bias correction (default: True)
+
+    __ https://arxiv.org/abs/1412.6980
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=2e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        bias_correction=True,
+    ):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            bias_correction=bias_correction,
+        )
+        super(Adamax, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data.float()
+                if grad.is_sparse:
+                    raise RuntimeError("Adamax does not support sparse gradients")
+
+                p_data_fp32 = p.data
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state["step"] = 0
+                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
+                    state["exp_inf"] = torch.zeros_like(p_data_fp32)
+                else:
+                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
+                    state["exp_inf"] = state["exp_inf"].to(p_data_fp32)
+
+                exp_avg, exp_inf = state["exp_avg"], state["exp_inf"]
+                beta1, beta2 = group["betas"]
+                eps = group["eps"]
+
+                state["step"] += 1
+
+                # Update biased first moment estimate.
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+
+                # Update the exponentially weighted infinity norm.
+                torch.max(
+                    exp_inf.mul_(beta2),
+                    grad.abs_(),
+                    out=exp_inf,
+                )
+
+                step_size = group["lr"]
+                if group["bias_correction"]:
+                    bias_correction = 1 - beta1 ** state["step"]
+                    step_size /= bias_correction
+
+                if group["weight_decay"] != 0:
+                    p_data_fp32.add_(
+                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
+                    )
+
+                p_data_fp32.addcdiv_(exp_avg, exp_inf.add(eps), value=-step_size)
+
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)
+
+        return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
new file mode 100644
index 0000000000..cfe57d07f9
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
@@ -0,0 +1,106 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+from fairseq import optim
+from omegaconf import DictConfig
+
+logger = logging.getLogger(__name__)
+
+
+class AMPOptimizer(optim.FairseqOptimizer):
+    """
+    Wrap an *optimizer* to support AMP (automatic mixed precision) training.
+    """
+
+    def __init__(self, cfg: DictConfig, params, fp32_optimizer, **kwargs):
+        super().__init__(cfg.optimizer)
+        self.fp32_optimizer = fp32_optimizer
+        amp_kwargs = {"init_scale": cfg.common.fp16_init_scale}
+        if getattr(cfg.common, "amp_scale_window", None) is not None:
+            amp_kwargs["growth_interval"] = cfg.common.amp_scale_window
+        self._grad_scaler = torch.cuda.amp.GradScaler(**amp_kwargs)
+        self.min_loss_scale = cfg.common.min_loss_scale
+
+    @classmethod
+    def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
+        """
+        Args:
+            cfg (omegaconf.DictConfig): fairseq args
+            params (iterable): iterable of parameters to optimize
+        """
+        fp32_optimizer = optim.build_optimizer(cfg.optimizer, params)
+        return cls(cfg, params, fp32_optimizer, **kwargs)
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.
+
+        Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
+        function additionally dynamically scales the loss to avoid gradient
+        underflow.
+        """
+        self._grad_scaler.scale(loss).backward()
+
+    def step(self):
+        self.scaler.step(self.fp32_optimizer)
+        self.scaler.update()
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Unscale gradients, clip their norm and report AMP overflow."""
+        self.scaler.unscale_(self.optimizer)
+        grad_norm = self.fp32_optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)
+        if not torch.isfinite(grad_norm).all():
+            new_loss_scale = self.next_loss_scale
+            if new_loss_scale <= self.min_loss_scale:
+                raise FloatingPointError(
+                    (
+                        "AMP: Minimum loss scale reached ({}). Your loss is probably exploding. "
+                        "Try restarting training or use fp32. {}"
+                    ).format(self.min_loss_scale, new_loss_scale)
+                )
+            else:
+                logger.info(
+                    f"AMP: overflow detected, setting scale to {new_loss_scale}"
+                )
+        return grad_norm
+
+    @property
+    def scaler(self):
+        return self._grad_scaler
+
+    @property
+    def next_loss_scale(self):
+        return self.scaler.get_scale() * self.scaler.get_backoff_factor()
+
+    @property
+    def optimizer(self):
+        return self.fp32_optimizer.optimizer
+
+    @optimizer.setter
+    def optimizer(self, optimizer):
+        self.fp32_optimizer.optimizer = optimizer
+
+    @property
+    def lr_scheduler(self):
+        return getattr(self.fp32_optimizer, "lr_scheduler", None)
+
+    @property
+    def optimizer_config(self):
+        return self.fp32_optimizer.optimizer_config
+
+    def get_lr(self):
+        return self.fp32_optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self.fp32_optimizer.set_lr(lr)
+
+    def all_reduce_grads(self, module):
+        self.fp32_optimizer.all_reduce_grads(module)
+
+    @property
+    def supports_flat_params(self):
+        return self.fp32_optimizer.supports_flat_params
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
new file mode 100644
index 0000000000..d6d0e04e86
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
@@ -0,0 +1,200 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+import torch
+import torch.distributed as dist
+from fairseq.dataclass.configs import FairseqBMUFConfig
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.optim.fairseq_optimizer import FairseqOptimizer
+
+
+class FairseqBMUF(FairseqOptimizer):
+    """
+    Implements incremental block distributed data parallelism similar to
+    https://ieeexplore.ieee.org/document/7472805
+
+    Paper title: Scalable training of deep learning machines by incremental
+    block training with intra-block parallel optimization and blockwise
+    model-update filtering
+    """
+
+    def __init__(self, cfg: FairseqBMUFConfig, optimizer):
+        super().__init__(cfg)
+        self._optimizer = optimizer
+        self._num_updates = 0
+        self.sync_iter = cfg.global_sync_iter
+        self.block_momentum = cfg.block_momentum
+        self.block_lr = cfg.block_lr
+        self._reset_local_data()
+        self.warmup_iteration = cfg.warmup_iterations
+        self.use_nbm = cfg.use_nbm
+        self.initial_state = self._optimizer.state_dict()
+        self.average_sync = self.cfg.average_sync
+        self.world_size = self.cfg.distributed_world_size
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        gen_parser_from_dataclass(parser, FairseqBMUFConfig())
+
+    @property
+    def optimizer(self):
+        return self._optimizer.optimizer
+
+    @property
+    def optimizer_config(self):
+        return self._optimizer.optimizer_config
+
+    def get_lr(self):
+        return self._optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self._optimizer.set_lr(lr)
+
+    def state_dict(self):
+        return self._optimizer.state_dict()
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        self._optimizer.load_state_dict(state_dict, optimizer_overrides)
+        self.initial_state = self._optimizer.state_dict()
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c*."""
+        self._optimizer.multiply_grads(c)
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm."""
+        return self._optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)
+
+    def average_params(self):
+        self._optimizer.average_params()
+
+    def _block_sync(self):
+        if self.world_size <= 1:
+            return
+        # Update the global model using local models from all GPUs
+        # (Step-1) Calculate grad between previously synced model and
+        # current local model
+        if self.block_momentum != 0:
+            self._calc_grad()
+
+        # (Step-2) Average gradient from all GPUs
+        self._avg_grad_from_all_gpus()
+
+        # (Step-3) Calculate global momentum and update the global model
+        if self.block_momentum != 0:
+            self._update_global_model()
+
+        # (Step-4) Average local optimizer params
+        if self.average_sync:
+            self.average_params()
+
+    def _is_warmup_end(self):
+        # Check whether train iterations is equal to warmup iter
+        if self.get_num_updates() == self.warmup_iteration:
+            return True
+        return False
+
+    def _is_bmuf_iter(self):
+        # Check whether train iterations is equal to bmuf sync iter
+        if (self.get_num_updates() > self.warmup_iteration) and (
+            self.get_num_updates() % self.sync_iter == 0
+        ):
+            return True
+        return False
+
+    def _warmup_sync(self, root_rank=0):
+        if self.world_size <= 1:
+            return
+        # Broadcast the local model to all gpus
+        for param in self.params:
+            dist.broadcast(param.data, src=root_rank)
+
+        # Update local optimizer state
+        if self.average_sync:
+            self._optimizer.average_params()
+        else:
+            self._optimizer.load_state_dict(self.initial_state)
+
+        self._reset_local_data()
+
+    def step(self, closure=None):
+        """Performs a single optimization step."""
+        self._optimizer.step(closure)
+        self.set_num_updates(self.get_num_updates() + 1)
+        if self._is_warmup_end():
+            self._warmup_sync()
+        elif self._is_bmuf_iter():
+            self._block_sync()
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters."""
+        self._optimizer.zero_grad()
+
+    def get_num_updates(self):
+        """Get the number of parameters updates."""
+        return self._num_updates
+
+    def set_num_updates(self, num_updates):
+        """Set the number of parameters updates."""
+        self._num_updates = num_updates
+
+    @torch.no_grad()
+    def _reset_local_data(self):
+        # (Step-0) Initialize global momentum parameters and store global copy on each gpu
+        self.global_params = [torch.zeros_like(p.data) for p in self.params]
+        self.smoothed_grads = [p.data.new_zeros(p.data.size()) for p in self.params]
+        self.grads = [p.data.new_zeros(p.data.size()) for p in self.params]
+
+        # saving the global model locally for calculating gradient during bmuf sync
+        for param, global_param in zip(self.params, self.global_params):
+            global_param.copy_(param.data)
+
+    @torch.no_grad()
+    def _calc_grad(self):
+        # global_params is basically the global copy from the previously finished
+        # synchronisation. param.data is local parameter after block_sync_freq
+        # for the local gpu. so grad is difference between previously synced
+        # model and current local model.
+        for index, (param, global_param) in enumerate(
+            zip(self.params, self.global_params)
+        ):
+            self.grads[index] = global_param - param.data
+
+    def _avg_grad_from_all_gpus(self):
+        for index, param in enumerate(self.params):
+            sync_para = param.data if self.block_momentum == 0 else self.grads[index]
+            sync_para /= float(dist.get_world_size())
+            dist.all_reduce(sync_para, op=dist.ReduceOp.SUM)
+
+    @torch.no_grad()
+    def _update_global_model(self):
+        for index, (param, global_param, smoothed_grad, grad) in enumerate(
+            zip(
+                self.params,
+                self.global_params,
+                self.smoothed_grads,
+                # all gpus would share the same value of smoothed_grad, since it is
+                # always computed on synchronized gradients.
+                self.grads,
+            )
+        ):
+            # global_param is basically last synchronized parameter. though
+            # smoothed_grad is local, all processes will have same value of
+            # smoothed_grad and hence param is globally synchronized copy.
+            # smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t)
+            smoothed_grad = self.block_momentum * smoothed_grad + self.block_lr * grad
+            param.data.copy_(global_param - smoothed_grad)
+
+            # A Nesterov momentum here is to do a partial weight update before
+            # calculating the gradient
+            if self.use_nbm:
+                param.data.copy_(param.data - self.block_momentum * smoothed_grad)
+
+            # backup for the next synchronization.
+            self.smoothed_grads[index] = smoothed_grad
+            global_param.copy_(param.data)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
new file mode 100644
index 0000000000..1ef0114ed6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
@@ -0,0 +1,273 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Dict, Any, List, Optional
+
+import torch.optim
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim import FairseqOptimizer, register_optimizer, _build_optimizer
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, build_lr_scheduler
+from omegaconf import II, open_dict
+import copy
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class OptimizerAndSchedulerConfig(FairseqDataclass):
+    optimizer: Any = None
+    lr_scheduler: Optional[Any] = None
+    lr: List = II("optimization.lr")
+    lr_float: Optional[
+        float
+    ] = None  # this makes it easier to sweep on learning rate with auto sweepers
+
+
+@dataclass
+class CompositeOptimizerConfig(FairseqDataclass):
+    groups: Dict[str, Any] = field(
+        default_factory=lambda: {},
+        metadata={
+            "help": "optimizer name -> optimizer OptimizerAndSchedulerConfig. "
+            "Configures a different optimizer and (optionally) lr scheduler for each parameter group"
+        },
+    )
+    dynamic_groups: bool = field(
+        default=False,
+        metadata={
+            "help": "create groups dynamically based on parameters, if set to False, all parameters needs to have group_names"
+        },
+    )
+
+
+@register_optimizer("composite", dataclass=CompositeOptimizerConfig)
+class FairseqCompositeOptimizer(FairseqOptimizer):
+
+    optimizers: Dict[str, FairseqOptimizer] = {}
+    lr_schedulers: Dict[str, FairseqLRScheduler] = {}
+    lr_scheduler: FairseqLRScheduler = None
+    _optimizer: torch.optim.Optimizer
+
+    def __init__(self, cfg: CompositeOptimizerConfig, params):
+        super().__init__(cfg)
+
+        assert (
+            len(params) > 1
+        ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)"
+
+        def dict_hash(dictionary: Dict[str, Any]) -> str:
+            import hashlib
+            import json
+
+            dhash = hashlib.md5()
+            encoded = json.dumps(dictionary, sort_keys=True).encode()
+            dhash.update(encoded)
+            return dhash.hexdigest()
+
+        groupped_params = defaultdict(list)
+        overrides = defaultdict(dict)
+        if not cfg.dynamic_groups:
+            for p in params:
+                group = getattr(p, "param_group", "default")
+                override_config = getattr(p, "optim_overrides", None)
+                if override_config is not None and bool(override_config):
+                    overrides[group] = override_config
+                else:
+                    assert (
+                        override_config is None or override_config == overrides[group]
+                    ), f"For group {group}, different overrides found {override_config} v/s {overrides[group]}"
+                groupped_params[group].append(p)
+
+            for p, params in groupped_params.items():
+                override_config = getattr(params[0], "optim_overrides", None)
+                if override_config is not None:
+                    for pp in params[1:]:
+                        assert override_config == getattr(
+                            pp, "optim_overrides", None
+                        ), f" {str(override_config)} != {str(getattr(pp, 'optim_overrides', None))}"
+        else:
+            for p in params:
+                group = getattr(p, "param_group", "default")
+                override_config = getattr(p, "optim_overrides", None)
+                if override_config is not None:
+                    override_config["group_name"] = group
+                    group_name = dict_hash(override_config)
+                    overrides[group_name] = override_config
+                else:
+                    group_name = group
+                groupped_params[group_name].append(p)
+        # Per-instance registries: the class-level dicts would be shared by all instances.
+        self.optimizers, self.lr_schedulers, self.optimizers_config = {}, {}, {}
+        for group, group_params in groupped_params.items():
+            p_group = group
+            if group in overrides and "group_name" in overrides[group]:
+                p_group = overrides[group]["group_name"]
+            if group in cfg.groups:
+                group_cfg = cfg.groups[group]
+                optimizer_config = copy.deepcopy(group_cfg.optimizer)
+                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
+                explicit_group_present = True
+            else:
+                group_cfg = cfg.groups[p_group]
+                optimizer_config = copy.deepcopy(group_cfg.optimizer)
+                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
+                explicit_group_present = False
+
+            if getattr(group_cfg, "lr_float", None) is not None:
+                with open_dict(optimizer_config):
+                    optimizer_config.lr = [group_cfg.lr_float]
+
+            if group in overrides and "optimizer" in overrides[group]:
+                with open_dict(optimizer_config):
+                    if "lr_scale" in overrides[group]["optimizer"]:
+                        lr_scale = overrides[group]["optimizer"]["lr_scale"]
+                        optimizer_config.lr = [
+                            lr * lr_scale for lr in optimizer_config.lr
+                        ]
+
+                        if explicit_group_present:
+                            logger.info(
+                                f"For group:{group}, config as well as override present for lr"
+                            )
+
+                    if (
+                        "weight_decay_scale" in overrides[group]["optimizer"]
+                        and "optimizer_config" in optimizer_config
+                    ):
+                        weight_decay_scale = overrides[group]["optimizer"][
+                            "weight_decay_scale"
+                        ]
+                        optimizer_config.weight_decay = (
+                            optimizer_config.weight_decay * weight_decay_scale
+                        )
+                        if explicit_group_present:
+                            logger.info(
+                                f"For group:{group}, config as well as override present for weight_decay"
+                            )
+
+            self.optimizers[group] = _build_optimizer(optimizer_config, group_params)
+            self.optimizers_config[group] = optimizer_config
+            if scheduler_config is not None:
+                with open_dict(scheduler_config):
+                    scheduler_config.lr = optimizer_config.lr
+                self.lr_schedulers[group] = build_lr_scheduler(
+                    scheduler_config, self.optimizers[group]
+                )
+        logger.info("Optimizers for different groups are as below")
+        for group in self.optimizers_config.keys():
+            logger.info(f"Group : {group}:{self.optimizers_config[group]}")
+        if len(self.lr_schedulers) > 0:
+            assert len(self.lr_schedulers) == len(self.optimizers), (
+                f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. "
+                f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}"
+            )
+            self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers)
+
+        self._optimizer = CompositeOptimizer(self.optimizers)
+
+    @property
+    def supports_groups(self):
+        return True
+
+    @property
+    def param_groups(self):
+        for opt in self.optimizers.values():
+            for group in opt.param_groups:
+                yield group
+
+    def get_lr(self):
+        """Return the current learning rate."""
+        k = (
+            "default"
+            if "default" in self.optimizers
+            else next(iter(self.optimizers.keys()))
+        )
+        return self.optimizers[k].param_groups[0]["lr"]
+
+    def state_dict(self):
+        """Return a dict mapping each group name to its optimizer's state dict."""
+        return {k: s.state_dict() for k, s in self.optimizers.items()}
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load per-group optimizer state dicts, skipping unknown group keys."""
+        for k, state in state_dict.items():
+            if k not in self.optimizers:
+                # skip extra keys like "loss_scale" added by fp16 optimizer
+                continue
+
+            overrides = (
+                optimizer_overrides[k]
+                if isinstance(optimizer_overrides, dict) and k in optimizer_overrides
+                else None
+            )
+            self.optimizers[k].load_state_dict(state, optimizer_overrides=overrides)
+
+
+class CompositeOptimizer(torch.optim.Optimizer):
+    def __init__(self, optimizers: Dict[str, FairseqOptimizer]):
+        self.optimizers = optimizers
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return all(o.supports_memory_efficient_fp16 for o in self.optimizers.values())
+
+    @property
+    def supports_flat_params(self):
+        return all(o.supports_flat_params for o in self.optimizers.values())
+
+    def step(self, closure=None, groups=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for k, opt in self.optimizers.items():
+            if groups is None or k in groups:
+                opt.step()
+
+        return loss
+
+    def zero_grad(self):
+        for opt in self.optimizers.values():
+            opt.zero_grad()
+
+
+class CompositeLRScheduler(FairseqLRScheduler):
+    def __init__(self, lr_schedulers):
+        super().__init__(None, None)
+
+        self.lr_schedulers = lr_schedulers
+
+    def state_dict(self):
+        """Return the LR scheduler state dict."""
+        return {k: s.state_dict() for k, s in self.lr_schedulers.items()}
+
+    def load_state_dict(self, state_dict):
+        """Load an LR scheduler state dict."""
+        for k, state in state_dict.items():
+            self.lr_schedulers[k].load_state_dict(state)
+
+    def step_begin_epoch(self, epoch):
+        """Update the learning rate at the beginning of the given epoch."""
+        for s in self.lr_schedulers.values():
+            s.step_begin_epoch(epoch)
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        for s in self.lr_schedulers.values():
+            s.step(epoch, val_loss)
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        return {k: s.step_update(num_updates) for k, s in self.lr_schedulers.items()}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
new file mode 100644
index 0000000000..b218934e71
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
@@ -0,0 +1,210 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import List
+
+import torch
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim import FairseqOptimizer, register_optimizer
+from omegaconf import II, DictConfig
+
+
+try:
+    import deepspeed
+
+    has_deepspeed = True
+except ImportError as e:
+    has_deepspeed = False
+
+
+def _get_cpu_adam():  # returns the backend object stored as CPUAdam.ds_opt_adam
+    try:
+        from deepspeed.ops.op_builder import CPUAdamBuilder
+
+        return CPUAdamBuilder().load()  # whatever the op builder's load() yields
+    except ImportError:
+        # fbcode (presumably builds without op_builder -- TODO confirm)
+        from deepspeed.ops.adam import DeepSpeedCPUAdam as ds_opt_adam
+
+        return ds_opt_adam
+
+
+@dataclass
+class FairseqCPUAdamConfig(FairseqDataclass):  # options of the "cpu_adam" optimizer
+    adam_betas: str = field(
+        default="(0.9, 0.999)", metadata={"help": "betas for Adam optimizer"}
+    )
+    adam_eps: float = field(
+        default=1e-8, metadata={"help": "epsilon for Adam optimizer"}
+    )
+    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
+    fp16_adam_stats: bool = field(
+        default=False, metadata={"help": "use FP16 stats (with automatic scaling)"}
+    )
+    # TODO common vars below in parent
+    lr: List[float] = II("optimization.lr")  # interpolated from optimization.lr
+
+
+@register_optimizer("cpu_adam", dataclass=FairseqCPUAdamConfig)
+class FairseqCPUAdam(FairseqOptimizer):
+    """fairseq wrapper around :class:`CPUAdam`, an Adam for CPU tensors.
+
+    Note on weight decay: the wrapped optimizer behaves like the "AdamW"
+    variant of Adam, so the closest PyTorch counterpart is
+    torch.optim.AdamW.
+    """
+
+    def __init__(self, cfg: DictConfig, params):
+        super().__init__(cfg)
+        self._optimizer = CPUAdam(params, **self.optimizer_config)
+
+    @property
+    def optimizer_config(self):
+        """
+        Build the keyword arguments handed to the optimizer. The same dict
+        is used to override optimizer args stored in checkpoints, so a run
+        can be resumed with different optimizer settings (for example a
+        different learning rate).
+        """
+        cfg = self.cfg
+        lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
+        return {
+            "lr": lr,
+            "betas": eval(cfg.adam_betas),  # NOTE(review): eval of a config string
+            "eps": cfg.adam_eps,
+            "weight_decay": cfg.weight_decay,
+            "use_fp16_stats": cfg.fp16_adam_stats,
+        }
+
+
+class CPUAdam(torch.optim.Optimizer):
+    """Adam that runs its update on CPU copies via the DeepSpeed CPU Adam op."""
+    optimizer_id = 0  # class-wide counter; each instance takes the next id
+
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        use_fp16_stats=False,
+    ):
+        """Register defaults and create backend Adam state under a fresh id."""
+        defaults = {
+            "lr": lr,
+            "bias_correction": bias_correction,
+            "betas": betas,
+            "eps": eps,
+            "weight_decay": weight_decay,
+        }
+        super().__init__(params, defaults)
+        self.use_fp16_stats = use_fp16_stats
+        self.FLOAT16_MAX = 65504.0  # largest finite float16 value
+
+        if not has_deepspeed:
+            raise ImportError("Please install DeepSpeed: pip install deepspeed")
+
+        self.opt_id = CPUAdam.optimizer_id
+        CPUAdam.optimizer_id = CPUAdam.optimizer_id + 1
+
+        self.ds_opt_adam = _get_cpu_adam()
+        adamw_mode = True
+        self.ds_opt_adam.create_adam(
+            self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode
+        )
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Apply one Adam update to every parameter that has a gradient."""
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        if torch.cuda.is_available():  # CPU-only hosts have nothing to wait on
+            torch.cuda.synchronize()
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                state = self.state[p]
+                if len(state) == 0:
+                    state["step"] = 0
+                    dtype = torch.float16 if self.use_fp16_stats else p.data.dtype
+                    # gradient momentums
+                    state["exp_avg"] = torch.zeros_like(
+                        p.data, dtype=dtype, device="cpu"
+                    )
+                    # gradient variances
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        p.data, dtype=dtype, device="cpu"
+                    )
+                    if self.use_fp16_stats:
+                        assert torch.is_floating_point(p.data)
+                        state["exp_avg_scale"] = 1.0
+                        state["exp_avg_sq_scale"] = 1.0
+
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+
+                p_data_bak = p.data  # backup of the original data pointer
+                # run the update on float32 CPU copies of the param and its grad
+                p.data = p.data.to(dtype=torch.float32, device="cpu")
+                p.grad.data = p.grad.data.to(dtype=torch.float32, device="cpu")
+
+                if self.use_fp16_stats:
+                    exp_avg = exp_avg.float() * state["exp_avg_scale"]
+                    exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"]
+
+                state["step"] += 1
+                beta1, beta2 = group["betas"]
+
+                self.ds_opt_adam.adam_update(
+                    self.opt_id,
+                    state["step"],
+                    group["lr"],
+                    beta1,
+                    beta2,
+                    group["eps"],
+                    group["weight_decay"],
+                    group["bias_correction"],
+                    p.data,
+                    p.grad.data,
+                    exp_avg,
+                    exp_avg_sq,
+                )
+                # a copy was made above: write the result back to the original
+                if p_data_bak.data_ptr() != p.data.data_ptr():
+                    p_data_bak.copy_(p.data)
+                    p.data = p_data_bak
+
+                if self.use_fp16_stats:
+
+                    def inf_norm(t):
+                        return torch.norm(t, float("inf"))
+
+                    # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py
+                    state["exp_avg_scale"], state["exp_avg_sq_scale"] = (
+                        1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX,
+                        1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX,
+                    )
+                    state["exp_avg"], state["exp_avg_sq"] = (
+                        (exp_avg / state["exp_avg_scale"]).half(),
+                        (exp_avg_sq / state["exp_avg_sq_scale"]).half(),
+                    )
+
+        return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
new file mode 100644
index 0000000000..60c47b8db0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+class DynamicLossScaler(object):
+    def __init__(
+        self,
+        init_scale=2.0**15,
+        scale_factor=2.0,
+        scale_window=2000,
+        tolerance=0.0,
+        threshold=None,
+        min_loss_scale=1e-4,
+    ):
+        self.loss_scale = init_scale
+        self.scale_factor = scale_factor  # multiplier/divisor used on rescale
+        self.scale_window = scale_window  # growth period, in iterations
+        self.tolerance = tolerance  # overflow ratio needed before scaling down
+        self.threshold = threshold  # optional lower clamp applied on decrease
+        self._iter = 0
+        self._last_overflow_iter = -1
+        self._last_rescale_iter = -1
+        self._overflows_since_rescale = 0
+        self.min_loss_scale = min_loss_scale  # floor; FloatingPointError at/below
+
+    def scale(self, outputs):
+        return self.loss_scale * outputs
+
+    def update(self):  # grows the scale on every scale_window-th iteration
+        if (self._iter - self._last_overflow_iter) % self.scale_window == 0:
+            self.loss_scale *= self.scale_factor
+            self._last_rescale_iter = self._iter
+        self._iter += 1
+
+    def _decrease_loss_scale(self):
+        self.loss_scale /= self.scale_factor
+        if self.threshold is not None:
+            self.loss_scale = max(self.loss_scale, self.threshold)
+
+    def check_overflow(self, grad_norm):
+        """Raise OverflowError (FloatingPointError at the floor) on inf/NaN."""
+        if grad_norm == float("inf") or grad_norm != grad_norm:
+            # overflow has occurred
+            prev_scale = self.loss_scale
+            # Clamp to >= 1: _iter is not advanced on the FloatingPointError path.
+            iter_since_rescale = max(self._iter - self._last_rescale_iter, 1)
+            self._last_overflow_iter = self._iter
+            self._overflows_since_rescale += 1
+            pct_overflow = self._overflows_since_rescale / float(iter_since_rescale)
+            if pct_overflow >= self.tolerance:
+                self._decrease_loss_scale()
+                self._last_rescale_iter = self._iter
+                self._overflows_since_rescale = 0
+
+            if self.loss_scale <= self.min_loss_scale:
+                # Use FloatingPointError as an uncommon error that parent
+                # functions can safely catch to stop training.
+                self.loss_scale = prev_scale
+                raise FloatingPointError(
+                    (
+                        "Minimum loss scale reached ({}). Your loss is probably exploding. "
+                        "Try lowering the learning rate, using gradient clipping or "
+                        "increasing the batch size."
+                    ).format(self.min_loss_scale)
+                )
+
+            self._iter += 1
+            raise OverflowError("setting loss scale to: " + str(self.loss_scale))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
new file mode 100644
index 0000000000..73c7c695ee
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
@@ -0,0 +1,187 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq import utils
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from collections import defaultdict
+
+
+class FairseqOptimizer(object):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg  # read back by subclasses, e.g. in optimizer_config
+
+    @classmethod
+    def add_args(cls, parser):
+        """Register this optimizer's dataclass options on *parser*, if any."""
+        dataclass_cls = getattr(cls, "__dataclass", None)
+        if dataclass_cls is not None:
+            gen_parser_from_dataclass(parser, dataclass_cls())
+
+    @property
+    def optimizer(self):
+        """The wrapped torch.optim.Optimizer, stored as ``_optimizer``."""
+        if not hasattr(self, "_optimizer"):
+            raise NotImplementedError
+        if not isinstance(self._optimizer, torch.optim.Optimizer):
+            raise ValueError("_optimizer must be an instance of torch.optim.Optimizer")
+        return self._optimizer
+
+    @optimizer.setter
+    def optimizer(self, optimizer):
+        """Replace the wrapped optimizer; the current one must already be valid."""
+        if not hasattr(self, "_optimizer"):
+            raise NotImplementedError
+        if not isinstance(self._optimizer, torch.optim.Optimizer):
+            raise ValueError("_optimizer must be an instance of torch.optim.Optimizer")
+        self._optimizer = optimizer
+
+    @property
+    def optimizer_config(self):
+        """
+        Keyword arguments used to override the optimizer args stored in a
+        checkpoint, so that training can resume with different optimizer
+        args (e.g. another learning rate). The base class leaves this
+        unimplemented.
+        """
+        raise NotImplementedError
+
+    @property
+    def params(self):
+        """Yield every parameter of every param group, in group order."""
+        for group in self.param_groups:
+            # delegate to the group's own parameter sequence
+            yield from group["params"]
+
+    @property
+    def param_groups(self):
+        return self.optimizer.param_groups  # straight from the wrapped optimizer
+
+    def __getstate__(self):
+        return self._optimizer.__getstate__()  # defer to the wrapped optimizer
+
+    def get_lr(self):
+        """Return the learning rate of the first param group."""
+        return self.param_groups[0]["lr"]
+
+    def set_lr(self, lr):
+        """Assign *lr* to every param group."""
+        for group in self.param_groups:
+            group["lr"] = lr
+
+    def state_dict(self):
+        """Return the wrapped optimizer's state dict."""
+        return self.optimizer.state_dict()
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Restore optimizer state, then apply *optimizer_overrides*.
+
+        The settings of the live optimizer instance (learning rate, etc.)
+        are generally preferred over the ones saved in *state_dict*, which
+        is what makes it possible to resume from a checkpoint with new
+        optimizer args.
+        """
+        self.optimizer.load_state_dict(state_dict)
+
+        if optimizer_overrides is not None and len(optimizer_overrides) > 0:
+            # re-apply the latest learning rate, momentum, etc. to each group
+            for group in self.param_groups:
+                group.update(optimizer_overrides)
+
+    def backward(self, loss):
+        """Backpropagate *loss*, accumulating gradients on the graph leaves."""
+        loss.backward()
+
+    def all_reduce_grads(self, module):
+        """All-reduce gradients through *module*, when it provides a hook."""
+        if hasattr(module, "all_reduce_grads"):
+            module.all_reduce_grads()
+
+    def multiply_grads(self, c):
+        """Scale every gradient in place by *c* (a number or a tensor)."""
+        dense_grads = defaultdict(lambda: defaultdict(list))
+        for p in self.params:
+            grad = p.grad
+            if grad is None:
+                continue
+            if grad.is_sparse:
+                grad.data.mul_(c.to(grad.device) if torch.is_tensor(c) else c)
+            else:
+                dense_grads[grad.device][grad.dtype].append(grad.data)
+        for device, by_dtype in dense_grads.items():
+            for grad_list in by_dtype.values():
+                torch._foreach_mul_(grad_list, c.to(device) if torch.is_tensor(c) else c)
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clip the gradient norm via ``utils.clip_grad_norm_``; returns its result."""
+        return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn)
+
+    def step(self, closure=None, scale=1.0, groups=None):
+        """Run one optimization step on the wrapped optimizer.
+
+        Unsupported ``scale`` is applied to the grads as ``1 / scale``
+        instead; ``groups`` is forwarded only when supported.
+        """
+        step_kwargs = {}
+        if self.supports_step_with_scale:
+            step_kwargs["scale"] = scale
+        elif scale != 1.0:
+            self.multiply_grads(1.0 / scale)
+        if self.supports_groups:
+            step_kwargs["groups"] = groups
+        self.optimizer.step(closure, **step_kwargs)
+
+    def zero_grad(self):
+        """Drop every parameter's gradient, then zero the wrapped optimizer."""
+        for param in self.params:
+            param.grad = None
+        self.optimizer.zero_grad()
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        """Capability flag read from the wrapped optimizer (default False)."""
+        flag = "supports_memory_efficient_fp16"
+        return getattr(self.optimizer, flag, False)
+
+    @property
+    def supports_step_with_scale(self):
+        """Capability flag read from the wrapped optimizer (default False)."""
+        flag = "supports_step_with_scale"
+        return getattr(self.optimizer, flag, False)
+
+    @property
+    def supports_groups(self):
+        """Capability flag read from the wrapped optimizer (default False)."""
+        flag = "supports_groups"
+        return getattr(self.optimizer, flag, False)
+
+    @property
+    def supports_flat_params(self):
+        """
+        Whether the wrapped optimizer can work on model parameters/gradients
+        that have been collapsed into one contiguous Tensor. Read from the
+        wrapped optimizer's attribute of the same name; False when absent.
+        """
+        flag = "supports_flat_params"
+        return getattr(self.optimizer, flag, False)
+
+    def average_params(self):
+        pass  # no-op hook in the base class
+
+    def broadcast_global_state_dict(self, state_dict):
+        """
+        Broadcast a global state dict to all ranks when the wrapped optimizer
+        implements it (e.g. it shards state between ranks); else pass through.
+        """
+        if not hasattr(self.optimizer, "broadcast_global_state_dict"):
+            return state_dict
+        # delegate to the optimizer's own implementation
+        return self.optimizer.broadcast_global_state_dict(state_dict)
+
+
+class LegacyFairseqOptimizer(FairseqOptimizer):
+    def __init__(self, args):  # stores ``args`` only; super().__init__ is not called
+        self.args = args
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
new file mode 100644
index 0000000000..6a4da342ca
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
@@ -0,0 +1,558 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict
+from itertools import chain
+
+import torch
+from omegaconf import DictConfig
+
+from fairseq import optim
+
+from .dynamic_loss_scaler import DynamicLossScaler
+
+
+class _FP16OptimizerMixin(object):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)  # next class in the MRO
+        self._multiply_factor = 1.0  # scale still owed to the FP32 grads
+        self._needs_sync = False  # set by backward(); read by every grad sync
+
+    @property
+    def has_flat_params(self):
+        return torch.is_tensor(self.fp32_params) or (
+            isinstance(self.fp32_params, dict)
+            and all(torch.is_tensor(t) for t in self.fp32_params.values())
+        )
+
+    @classmethod
+    def build_fp32_params(cls, args, params, flatten=True):
+        """Create an FP32 copy of params and grads (per-device flat, or a list)."""
+        if flatten:
+            is_pipeline_parallel = getattr(
+                args, "pipeline_model_parallel", False
+            ) and getattr(args, "distributed_no_spawn", False)
+            total_param_size = sum(p.data.numel() for p in params)
+            devices = [torch.cuda.current_device()]
+            if is_pipeline_parallel:
+                devices = list(set(args.pipeline_devices))
+            fp32_params = {}
+            for device in devices:
+                if is_pipeline_parallel:
+                    device_param_size = sum(
+                        p.data.numel() for p in params if p.device.index == device
+                    )
+                    device_params = [p for p in params if p.device.index == device]
+                else:
+                    device_param_size = total_param_size
+                    device_params = params
+                fp32_params[device] = (
+                    device_params[0].new(0).float().new(device_param_size)
+                )
+                offset = 0
+                for p in device_params:
+                    numel = p.data.numel()
+                    fp32_params[device][offset : offset + numel].copy_(p.data.view(-1))
+                    offset += numel
+                fp32_params[device] = torch.nn.Parameter(fp32_params[device])
+                fp32_params[device].grad = fp32_params[device].data.new(
+                    device_param_size
+                )
+            return fp32_params
+        else:
+            fp32_params = []
+            for p in params:
+                p32 = torch.nn.Parameter(p.data.float())
+                if hasattr(p, "expert"):
+                    p32.expert = True
+                elif hasattr(p, "base_expert"):
+                    p32.base_expert = True
+                p32.grad = torch.zeros_like(p32.data)
+                if hasattr(p, "param_group"):
+                    p32.param_group = p.param_group
+                if hasattr(p, "optim_overrides"):
+                    p32.optim_overrides = p.optim_overrides
+                fp32_params.append(p32)
+            return fp32_params
+
+    def state_dict(self):
+        """Return the FP32 optimizer's state dict plus the current loss scale."""
+        state_dict = self.fp32_optimizer.state_dict()
+        if self.scaler is not None:
+            state_dict["loss_scale"] = self.scaler.loss_scale
+        return state_dict
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load an optimizer state dict.
+
+        In general we should prefer the configuration of the existing optimizer
+        instance (e.g., learning rate) over that found in the state_dict. This
+        allows us to resume training from a checkpoint using a new set of
+        optimizer args.
+        """
+        if "loss_scale" in state_dict and self.scaler is not None:
+            self.scaler.loss_scale = state_dict["loss_scale"]
+        self.fp32_optimizer.load_state_dict(state_dict, optimizer_overrides)
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.
+
+        Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
+        function additionally dynamically scales the loss to avoid gradient
+        underflow.
+        """
+        if self.scaler is not None:
+            loss = self.scaler.scale(loss)
+        loss.backward()
+        self._needs_sync = True
+
+    def _sync_fp16_grads_to_fp32(self):
+        if self._needs_sync:
+            # copy FP16 grads to FP32
+            if self.has_flat_params:
+                devices = list(self.fp32_params.keys())
+                device_params_dict = defaultdict(list)
+                for p in self.fp16_params:
+                    if p.requires_grad:
+                        device_params_dict[p.device.index].append(p)
+                for device in devices:
+                    device_params = device_params_dict[device]
+                    offset = 0
+                    for p in device_params:
+                        grad_data = (
+                            p.grad.data
+                            if p.grad is not None
+                            else p.data.new_zeros(p.data.shape)
+                        )
+                        numel = grad_data.numel()
+                        self.fp32_params[device].grad.data[
+                            offset : offset + numel
+                        ].copy_(grad_data.view(-1))
+                        offset += numel
+            else:
+                for p, p32 in zip(self.fp16_params, self.fp32_params):
+                    if not p.requires_grad:
+                        continue
+                    if p.grad is not None:
+                        if p32.grad is None:
+                            p32.grad = p.grad.data.float()
+                        else:
+                            p32.grad.data.copy_(p.grad.data)
+                    else:
+                        p32.grad = torch.zeros_like(p.data, dtype=torch.float)
+
+            self._needs_sync = False
+
+    def _sync_fp32_params_to_fp16(self):
+        # copy FP32 params back into FP16 model
+        if self.has_flat_params:
+            devices = list(self.fp32_params.keys())
+            device_params_dict = defaultdict(list)
+            for p in self.fp16_params:
+                device_params_dict[p.device.index].append(p)
+            for device in devices:
+                device_params = device_params_dict[device]
+                offset = 0
+                for p in device_params:
+                    numel = p.data.numel()
+                    p.data.copy_(
+                        self.fp32_params[device]
+                        .data[offset : offset + numel]
+                        .view_as(p.data)
+                    )
+                    offset += numel
+        else:
+            for p, p32 in zip(self.fp16_params, self.fp32_params):
+                if not p.requires_grad:
+                    continue
+                p.data.copy_(p32.data)
+
+    def _unscale_grads(self):
+        self._sync_fp16_grads_to_fp32()
+        if (
+            # Skip the multiplication if it's a no-op (i.e., if _multiply_factor
+            # is 1.0). At the same time, we want to avoid the device-to-host
+            # transfer by comparing it to 1.0. Since _multiply_factor starts as
+            # a Python float, we roughly assume that if it's a tensor then it's
+            # probably not =1.0 anymore and we do the multiplication. Otherwise
+            # we can safely check the value without a D2H transfer.
+            torch.is_tensor(self._multiply_factor)
+            or self._multiply_factor != 1.0
+        ):
+            self.fp32_optimizer.multiply_grads(self._multiply_factor)
+            self._multiply_factor = 1.0
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant ``c``."""
+        self._multiply_factor *= c
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm and updates dynamic loss scaler."""
+        self._sync_fp16_grads_to_fp32()
+
+        grad_norm = self._multiply_factor * self.fp32_optimizer.clip_grad_norm(
+            0, aggregate_norm_fn
+        )
+
+        if torch.is_tensor(self._multiply_factor):
+            self._multiply_factor = self._multiply_factor.to(grad_norm.device)
+
+        if self.scaler is not None:
+            if grad_norm > max_norm > 0.0:
+                self._multiply_factor *= max_norm / grad_norm
+
+            self.scaler.check_overflow(grad_norm)
+        elif max_norm > 0.0:
+            clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1)
+            self._multiply_factor *= clip_coef
+
+        return grad_norm
+
+    def step(self, closure=None, groups=None):
+        """Performs a single optimization step."""
+        self._sync_fp16_grads_to_fp32()
+
+        if getattr(self, "supports_step_with_scale", False):
+            self.fp32_optimizer.step(
+                closure, scale=(1.0 / self._multiply_factor), groups=groups
+            )
+        else:
+            self._unscale_grads()
+            self.fp32_optimizer.step(closure, groups=groups)
+
+        if self.scaler is not None:
+            self.scaler.update()
+
+        self._sync_fp32_params_to_fp16()
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters."""
+        for p in self.fp16_params:
+            p.grad = None
+        if self.has_flat_params:
+            if torch.is_tensor(self.fp32_params):
+                self.fp32_params.grad.zero_()
+            elif isinstance(self.fp32_params, dict):
+                for fp32_params in self.fp32_params.values():
+                    fp32_params.grad.zero_()
+            else:
+                raise RuntimeError("self.fp32_params must be a tensor or dict")
+        else:
+            for p32 in self.fp32_params:
+                if p32.grad is not None:
+                    p32.grad.zero_()
+        self._needs_sync = False
+
+        if self.scaler is not None:
+            self._multiply_factor = 1.0 / float(self.scaler.loss_scale)
+
+
+class FP16Optimizer(_FP16OptimizerMixin, optim.FairseqOptimizer):
+    """
+    FP16 (mixed precision) training wrapper around an FP32 *optimizer*.
+    """
+
+    def __init__(self, cfg: DictConfig, params, fp32_optimizer, fp32_params, **kwargs):
+        super().__init__(cfg.optimizer)
+        self.fp16_params = params
+        self.fp32_optimizer = fp32_optimizer
+        self.fp32_params = fp32_params
+
+        scale_window = getattr(cfg.common, "fp16_scale_window", None)
+        if scale_window is None:
+            # No explicit window given: derive one from the data-parallel
+            # size and the (single) update frequency.
+            update_freq = cfg.optimization.update_freq
+            if len(update_freq) > 1:
+                raise ValueError(
+                    "--fp16-scale-window must be given explicitly when using a "
+                    "custom --update-freq schedule"
+                )
+            data_parallel_size = int(
+                cfg.distributed_training.distributed_world_size
+                / cfg.common.model_parallel_size
+            )
+            scale_window = int(2**14 / data_parallel_size / update_freq[0])
+
+        if getattr(cfg.common, "bf16", False):
+            # disable loss scaling for bfloat16
+            self.scaler = None
+        else:
+            self.scaler = DynamicLossScaler(
+                init_scale=cfg.common.fp16_init_scale,
+                scale_window=scale_window,
+                tolerance=cfg.common.fp16_scale_tolerance,
+                threshold=cfg.common.threshold_loss_scale,
+                min_loss_scale=cfg.common.min_loss_scale,
+            )
+
+    @classmethod
+    def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
+        """
+        Args:
+            cfg (omegaconf.DictConfig): fairseq args
+            params (iterable): iterable of parameters to optimize
+        """
+        no_flatten = getattr(cfg.common, "fp16_no_flatten_grads", False)
+        # bf16 turns flattening off too: faster on TPUs without flat grads
+        flatten = not no_flatten and not getattr(cfg.common, "bf16", False)
+        fp32_params = cls.build_fp32_params(cfg.optimizer, params, flatten=flatten)
+        # flat params are wrapped in a one-element list for the optimizer
+        optimizer_params = [fp32_params] if flatten else fp32_params
+        fp32_optimizer = optim.build_optimizer(cfg.optimizer, optimizer_params)
+        if flatten and not fp32_optimizer.supports_flat_params:
+            raise RuntimeError(
+                f"chosen optimizer {fp32_optimizer.__class__.__name__} does not support flat params, please set --fp16-no-flatten-grads"
+            )
+        # extra keyword arguments go to the constructor unchanged
+        return cls(cfg, params, fp32_optimizer, fp32_params, **kwargs)
+
+    @property
+    def optimizer(self):
+        return self.fp32_optimizer.optimizer  # optimizer held by the FP32 wrapper
+
+    @optimizer.setter
+    def optimizer(self, optimizer):
+        self.fp32_optimizer.optimizer = optimizer
+
+    @property
+    def lr_scheduler(self):
+        return getattr(self.fp32_optimizer, "lr_scheduler", None)  # None if absent
+
+    @property
+    def optimizer_config(self):
+        return self.fp32_optimizer.optimizer_config
+
+    def get_lr(self):
+        return self.fp32_optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self.fp32_optimizer.set_lr(lr)
+
+    def all_reduce_grads(self, module):
+        self.fp32_optimizer.all_reduce_grads(module)
+
+    @property
+    def supports_flat_params(self):
+        return self.fp32_optimizer.supports_flat_params
+
+
+class _MemoryEfficientFP16OptimizerMixin(object):
+    def __init__(self, *args, **kwargs):
+        # cooperative init: hand every argument to the next class in the MRO
+        super().__init__(*args, **kwargs)
+        self._multiply_factor = 1.0
+
+    @property
+    def has_flat_params(self):
+        return False
+
+    def state_dict(self):
+        """Return the optimizer's state dict, with the loss scale when one is used."""
+        state = self.wrapped_optimizer.state_dict()
+        if self.scaler is not None:
+            state["loss_scale"] = self.scaler.loss_scale
+        return state
+
+    def load_state_dict(self, state_dict, optimizer_overrides=None):
+        """Load an optimizer state dict.
+
+        The configuration of the existing optimizer instance (e.g., learning
+        rate) is generally preferred over the one found in the state_dict,
+        which lets us resume training from a checkpoint with a new set of
+        optimizer args.
+        """
+        if "loss_scale" in state_dict and self.scaler is not None:
+            self.scaler.loss_scale = state_dict["loss_scale"]
+
+        self.wrapped_optimizer.load_state_dict(state_dict, optimizer_overrides)
+
+        # Hack: PyTorch automatically casts the optimizer state to match the
+        # type of the current parameters. But with --memory-efficient-fp16 the
+        # params are FP16 while the optimizer state is FP32 and we don't want
+        # to cast. A workaround is to manually copy back the original state
+        # after the optimizer has been loaded.
+        if getattr(self.optimizer, "disable_mem_eff_fp16_loading_hack", False):
+            return
+        # Pair each saved param id with the live param that sits at the same
+        # position across the param groups, then put the saved state back as-is.
+        live_params = chain.from_iterable(
+            g["params"] for g in self.optimizer.param_groups
+        )
+        saved_ids = chain.from_iterable(
+            g["params"] for g in state_dict["param_groups"]
+        )
+        id_map = dict(zip(saved_ids, live_params))
+        for old_id, saved_state in state_dict["state"].items():
+            if old_id in id_map:
+                self.optimizer.state[id_map[old_id]] = saved_state
+
+    def backward(self, loss):
+        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.
+
+        Unlike :func:`fairseq.optim.FairseqOptimizer.backward`, the loss is
+        first passed through the dynamic loss scaler (when there is one) to
+        avoid gradient underflow.
+        """
+        # without a scaler (self.scaler is None) the loss is used unchanged
+        scaled = loss if self.scaler is None else self.scaler.scale(loss)
+        scaled.backward()
+
+    def _unscale_grads(self):
+        # Skip the multiplication if it's a no-op (i.e., if _multiply_factor
+        # is 1.0). At the same time, we want to avoid the device-to-host
+        # transfer by comparing it to 1.0. Since _multiply_factor starts as
+        # a Python float, we roughly assume that if it's a tensor then it's
+        # probably not =1.0 anymore and we do the multiplication. Otherwise
+        # we can safely check the value without a D2H transfer.
+        factor = self._multiply_factor
+        if not torch.is_tensor(factor) and factor == 1.0:
+            return
+
+        self.wrapped_optimizer.multiply_grads(factor)
+        self._multiply_factor = 1.0
+
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c* (folded into the pending factor)."""
+        self._multiply_factor *= c
+
+    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
+        """Clips gradient norm and updates dynamic loss scaler."""
+        max_norm = float(max_norm)
+        grad_norm = self._multiply_factor * self.wrapped_optimizer.clip_grad_norm(
+            0, aggregate_norm_fn
+        )
+
+        if self.scaler is None:
+            if max_norm > 0.0:
+                clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1)
+                self._multiply_factor *= clip_coef
+            return grad_norm
+
+        grad_norm_cpu = float(grad_norm)
+        if grad_norm_cpu > max_norm > 0.0:
+            self._multiply_factor *= max_norm / grad_norm_cpu
+        # detect overflow and adjust loss scale
+        self.scaler.check_overflow(grad_norm_cpu)
+        return grad_norm
+
+    def step(self, closure=None, groups=None):
+        """Performs a single optimization step."""
+        if not getattr(self, "supports_step_with_scale", False):
+            # apply the pending factor to the grads before a plain step
+            self._unscale_grads()
+            self.wrapped_optimizer.step(closure, groups=groups)
+        else:
+            # NOTE(msb) optimizer divides by scale factor
+            inv_factor = 1.0 / self._multiply_factor
+            self.wrapped_optimizer.step(closure, scale=inv_factor, groups=groups)
+
+        if self.scaler is not None:
+            self.scaler.update()
+
+    def zero_grad(self):
+        """Clears the gradients of all optimized parameters."""
+        self.wrapped_optimizer.zero_grad()
+        if self.scaler is None:
+            self._multiply_factor = 1.0
+        else:
+            self._multiply_factor = 1.0 / float(self.scaler.loss_scale)
+
+    @property
+    def supports_flat_params(self):
+        return self.wrapped_optimizer.supports_flat_params
+
+
+class MemoryEfficientFP16Optimizer(
+    _MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer
+):
+    """
+    Wrap an *optimizer* to support FP16 (mixed precision) training.
+
+    Compared to :class:`fairseq.optim.FP16Optimizer`, this version does not
+    maintain an FP32 copy of the model. We instead expect the optimizer to
+    convert the gradients to FP32 internally and sync the results back to the
+    FP16 model params. This significantly reduces memory usage but slightly
+    increases the time spent in the optimizer.
+
+    Since this wrapper depends on specific functionality in the wrapped
+    optimizer (i.e., on-the-fly conversion of grads to FP32), only certain
+    optimizers can be wrapped. This is determined by the
+    *supports_memory_efficient_fp16* property.
+    """
+
+    def __init__(
+        self, cfg: DictConfig, params, optimizer, allow_unsupported=False, **kwargs
+    ):
+        if not allow_unsupported and not optimizer.supports_memory_efficient_fp16:
+            raise ValueError(
+                "Unsupported optimizer: {}".format(optimizer.__class__.__name__)
+            )
+
+        super().__init__(getattr(cfg, "optimizer", None))  # None if cfg has no .optimizer
+        self.wrapped_optimizer = optimizer
+
+        if getattr(cfg.common, "fp16_scale_window", None) is None:
+            if len(cfg.optimization.update_freq) > 1:
+                raise ValueError(
+                    "--fp16-scale-window must be given explicitly when using a "
+                    "custom --update-freq schedule"
+                )
+            data_parallel_size = int(
+                cfg.distributed_training.distributed_world_size
+                / cfg.common.model_parallel_size
+            )
+            scale_window = int(
+                2**14 / data_parallel_size / cfg.optimization.update_freq[0]
+            )
+        else:
+            scale_window = cfg.common.fp16_scale_window
+
+        if not getattr(cfg.common, "bf16", False):
+            self.scaler = DynamicLossScaler(
+                init_scale=cfg.common.fp16_init_scale,
+                scale_window=scale_window,
+                tolerance=cfg.common.fp16_scale_tolerance,
+                threshold=cfg.common.threshold_loss_scale,
+                min_loss_scale=cfg.common.min_loss_scale,
+            )
+        else:
+            # disable loss scaling for bfloat16
+            self.scaler = None
+
+    @classmethod
+    def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
+        """
+        Args:
+            cfg (DictConfig): fairseq config; ``cfg.optimizer`` selects the optimizer
+            params (iterable): iterable of parameters to optimize
+        """
+        fp16_optimizer = optim.build_optimizer(cfg.optimizer, params)
+        return cls(cfg, params, fp16_optimizer, **kwargs)
+
+    @property
+    def optimizer(self):
+        return self.wrapped_optimizer.optimizer
+
+    @optimizer.setter
+    def optimizer(self, optimizer):
+        self.wrapped_optimizer.optimizer = optimizer
+
+    @property
+    def optimizer_config(self):
+        return self.wrapped_optimizer.optimizer_config
+
+    @property
+    def lr_scheduler(self):
+        return getattr(self.wrapped_optimizer, "lr_scheduler", None)
+
+    def get_lr(self):
+        return self.wrapped_optimizer.get_lr()
+
+    def set_lr(self, lr):
+        self.wrapped_optimizer.set_lr(lr)
+
+    def all_reduce_grads(self, module):
+        self.wrapped_optimizer.all_reduce_grads(module)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
new file mode 100644
index 0000000000..39a2a83694
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
@@ -0,0 +1,389 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import types
+
+import torch
+
+
+def get_fused_adam_class():
+    """
+    Pick a FusedAdam implementation from apex, or return None if there is none.
+    The "contrib" extension (FusedAdamV1) is tried first: it is a bit faster
+    than the main interface (FusedAdamV2), although technically deprecated.
+    """
+    global fused_adam_cuda
+    import importlib
+
+    try:
+        # The "deprecated" interface in recent versions of apex is a bit
+        # faster than the main interface, since we don't use the apex
+        # optimizer. This can be installed by passing the
+        # `--deprecated_fused_adam` option when building apex.
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+    except ImportError:
+        pass
+    else:
+        return FusedAdamV1
+    try:
+        # fallback to the newer interface
+        from apex.multi_tensor_apply import multi_tensor_applier
+        from apex.optimizers import FusedAdam as _FusedAdam  # noqa
+
+        return FusedAdamV2 if multi_tensor_applier.available else None
+    except ImportError:
+        return None
+
+
+class FusedAdamV1(torch.optim.Optimizer):
+    """
+    Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
+    ``python setup.py install --cuda_ext --cpp_ext``.
+
+    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Compared to the original version in Apex, the fairseq version casts grads
+    and params to FP32 internally to support ``--memory-efficient-fp16``.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False) NOT SUPPORTED in FusedAdam!
+        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
+            adds eps to the bias-corrected second moment estimate before
+            evaluating square root instead of adding it to the square root of
+            second moment estimate as in the original paper. (default: False)
+    .. _Adam: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        eps_inside_sqrt=False,
+        weight_decay=0.0,
+        max_grad_norm=0.0,
+        amsgrad=False,
+        use_fp16_stats=False,
+    ):
+        global fused_adam_cuda
+        import importlib
+
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+
+        if amsgrad:
+            raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
+        defaults = {
+            "lr": lr,
+            "bias_correction": bias_correction,
+            "betas": betas,
+            "eps": eps,
+            "weight_decay": weight_decay,
+            "max_grad_norm": max_grad_norm,
+        }
+        super().__init__(params, defaults)
+        self.eps_mode = 0 if eps_inside_sqrt else 1
+
+        self.use_fp16_stats = use_fp16_stats
+        self.FLOAT16_MAX = 65504.0  # largest finite float16 value
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    @property
+    def supports_step_with_scale(self):
+        return True
+
+    def step(self, closure=None, grads=None, scale=1.0, grad_norms=None):
+        """Performs a single optimization step.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+            grads (list of tensors, optional): weight gradient to use for the
+                optimizer update. If gradients have type torch.half, parameters
+                are expected to be in type torch.float. (default: None)
+            scale (float, optional): factor to divide gradient tensor values
+                by before applying to weights. (default: 1)
+            grad_norms (list, optional): one gradient norm per param group;
+                read only for groups with ``max_grad_norm`` > 0, where it
+                is used to fold clipping into the scale. (default: None)
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        if grads is None:
+            grads_group = [None] * len(self.param_groups)
+        # backward compatibility
+        # assuming a list/generator of parameter means single group
+        elif isinstance(grads, types.GeneratorType):
+            grads_group = [grads]
+        elif not isinstance(grads[0], list):
+            grads_group = [grads]
+        else:
+            grads_group = grads
+
+        if grad_norms is None:
+            grad_norms = [None] * len(self.param_groups)
+
+        for group, grads_this_group, grad_norm in zip(
+            self.param_groups, grads_group, grad_norms
+        ):
+            if grads_this_group is None:
+                grads_this_group = [None] * len(group["params"])
+
+            # compute combined scale factor for this group
+            combined_scale = scale
+            if group.get("max_grad_norm", 0) > 0:
+                # norm is in fact norm*scale
+                clip = ((grad_norm / scale) + 1e-6) / group["max_grad_norm"]
+                if clip > 1:
+                    combined_scale = clip * scale
+
+            bias_correction = 1 if group.get("bias_correction", 1) else 0
+
+            for p, grad in zip(group["params"], grads_this_group):
+                # note: p.grad should not ever be set for correct
+                # operation of mixed precision optimizer that sometimes
+                # sends None gradients
+                if p.grad is None and grad is None:
+                    continue
+                if grad is None:
+                    grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError(
+                        "FusedAdam does not support sparse gradients, "
+                        "please consider SparseAdam instead"
+                    )
+
+                if p.device.type == "cpu":
+                    p_data_fp32 = p.data.cuda(non_blocking=True).float()
+                    out_p = torch.tensor([], dtype=torch.float)
+                else:
+                    p_data_fp32 = p.data.float()
+                    out_p = p.data
+
+                state = self.state[p]
+
+                # State initialization
+                dtype = torch.float16 if self.use_fp16_stats else p_data_fp32.dtype
+                if len(state) == 0:
+                    state["step"] = 0
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(p_data_fp32, dtype=dtype)
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32, dtype=dtype)
+                    if self.use_fp16_stats:
+                        state["exp_avg_scale"] = 1.0
+                        state["exp_avg_sq_scale"] = 1.0
+                else:
+                    device = p_data_fp32.device
+                    state["exp_avg"] = state["exp_avg"].to(device, dtype)
+                    state["exp_avg_sq"] = state["exp_avg_sq"].to(device, dtype)
+
+                exp_avg = state["exp_avg"]
+                exp_avg_sq = state["exp_avg_sq"]
+                if self.use_fp16_stats:
+                    assert exp_avg.dtype == torch.float16
+                    exp_avg = exp_avg.float() * state["exp_avg_scale"]
+                    exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"]
+                beta1, beta2 = group["betas"]
+
+                if "step" not in state:
+                    state["step"] = group["step"]
+
+                state["step"] += 1
+
+                with torch.cuda.device(p_data_fp32.device):
+                    fused_adam_cuda.adam(
+                        p_data_fp32,
+                        out_p,
+                        exp_avg,
+                        exp_avg_sq,
+                        grad,
+                        group["lr"],
+                        beta1,
+                        beta2,
+                        group["eps"],
+                        combined_scale,
+                        state["step"],
+                        self.eps_mode,
+                        bias_correction,
+                        group["weight_decay"],
+                    )
+
+                if p.device.type == "cpu":
+                    p.data.copy_(p_data_fp32, non_blocking=True)
+
+                if self.use_fp16_stats:
+
+                    def inf_norm(t):
+                        return torch.norm(t, float("inf"))
+
+                    # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py
+                    state["exp_avg_scale"], state["exp_avg_sq_scale"] = (
+                        1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX,
+                        1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX,
+                    )
+                    state["exp_avg"], state["exp_avg_sq"] = (
+                        (exp_avg / state["exp_avg_scale"]).half(),
+                        (exp_avg_sq / state["exp_avg_sq_scale"]).half(),
+                    )
+
+        return loss
+
+
+try:
+    from apex.multi_tensor_apply import multi_tensor_applier
+    from apex.optimizers import FusedAdam
+
+    class FusedAdamV2(FusedAdam):
+        """
+        Compared to the original version in Apex, the fairseq version casts grads
+        and params to FP32 internally to support ``--memory-efficient-fp16``.
+        """
+
+        def __init__(self, *args, use_fp16_stats=False, **kwargs):
+            if use_fp16_stats:
+                raise NotImplementedError(
+                    "--fp16-adam-stats is only supported with FusedAdamV1"
+                )
+            super().__init__(*args, **kwargs)
+            if not hasattr(self, "multi_tensor_adam"):
+                raise Exception(
+                    "Apex installation is outdated. Please install an updated version of apex."
+                )
+
+        @property
+        def supports_memory_efficient_fp16(self):
+            return True
+
+        @property
+        def supports_flat_params(self):
+            return True
+
+        def step(
+            self,
+            closure=None,
+            grads=None,
+            output_params=None,
+            scale=None,
+            grad_norms=None,
+        ):
+            """Performs a single optimization step."""
+            loss = None
+            if closure is not None:
+                loss = closure()
+
+            for group in self.param_groups:
+                bias_correction = 1 if group["bias_correction"] else 0
+                beta1, beta2 = group["betas"]
+
+                # assume same step across group now to simplify things
+                # per parameter step can easily be supported by making it a tensor, or passing a list into the kernel
+                if "step" in group:
+                    group["step"] += 1
+                else:
+                    group["step"] = 1
+
+                # create lists for multi-tensor apply
+                g_16, p_16, orig_p_16, m_16, v_16 = [], [], [], [], []
+                g_32, p_32, m_32, v_32 = [], [], [], []
+
+                for p in group["params"]:
+                    if p.grad is None:
+                        continue
+                    if p.grad.data.is_sparse:
+                        raise RuntimeError(
+                            "FusedAdam does not support sparse gradients, "
+                            "please consider SparseAdam instead"
+                        )
+
+                    state = self.state[p]
+                    # State initialization
+                    if len(state) == 0:
+                        # Exponential moving average of gradient values
+                        state["exp_avg"] = torch.zeros_like(p.data, dtype=torch.float)
+                        # Exponential moving average of squared gradient values
+                        state["exp_avg_sq"] = torch.zeros_like(
+                            p.data, dtype=torch.float
+                        )
+                    else:
+                        for key in ("exp_avg", "exp_avg_sq"):
+                            state[key] = state[key].to(
+                                device=p.data.device, dtype=torch.float
+                            )
+
+                    if p.dtype == torch.float16:
+                        g_16.append(p.grad.data.float())
+                        p_16.append(p.data.float())
+                        orig_p_16.append(p.data)
+                        m_16.append(state["exp_avg"])
+                        v_16.append(state["exp_avg_sq"])
+                    elif p.dtype == torch.float32:
+                        g_32.append(p.grad.data)
+                        p_32.append(p.data)
+                        m_32.append(state["exp_avg"])
+                        v_32.append(state["exp_avg_sq"])
+                    else:
+                        raise RuntimeError("FusedAdam only support fp16 and fp32.")
+
+                if not (g_16 or g_32):
+                    continue  # no gradients in this group (or it is empty)
+                with torch.cuda.device((p_16 or p_32)[0].device):
+                    if len(g_16) > 0:
+                        multi_tensor_applier(
+                            self.multi_tensor_adam,
+                            self._dummy_overflow_buf,
+                            [g_16, p_16, m_16, v_16],
+                            group["lr"],
+                            beta1,
+                            beta2,
+                            group["eps"],
+                            group["step"],
+                            self.adam_w_mode,
+                            bias_correction,
+                            group["weight_decay"],
+                        )
+                        for orig_p, p in zip(orig_p_16, p_16):
+                            orig_p.copy_(p.data)
+                    if len(g_32) > 0:
+                        multi_tensor_applier(
+                            self.multi_tensor_adam,
+                            self._dummy_overflow_buf,
+                            [g_32, p_32, m_32, v_32],
+                            group["lr"],
+                            beta1,
+                            beta2,
+                            group["eps"],
+                            group["step"],
+                            self.adam_w_mode,
+                            bias_correction,
+                            group["weight_decay"],
+                        )
+
+            return loss
+
+except ImportError:
+    pass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
new file mode 100644
index 0000000000..f4f2bdb0c6
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.optim import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("lamb")
+class FairseqLAMB(LegacyFairseqOptimizer):
+    """LAMB optimizer."""
+
+    def __init__(self, args, params):
+        super().__init__(args)
+        try:
+            from apex.optimizers import FusedLAMB
+        except ImportError as err:
+            raise ImportError("Please install apex to use LAMB optimizer") from err
+
+        self._optimizer = FusedLAMB(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B',
+                            help='betas for LAMB optimizer')
+        parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D',
+                            help='epsilon for LAMB optimizer')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "betas": eval(self.args.lamb_betas),  # NOTE(review): eval() of CLI input
+            "eps": self.args.lamb_eps,
+            "weight_decay": self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return False
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
new file mode 100644
index 0000000000..5b3dbc023a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import importlib
+import os
+
+from fairseq import registry
+from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import (  # noqa
+    FairseqLRScheduler,
+    LegacyFairseqLRScheduler,
+)
+from omegaconf import DictConfig
+
+
+(
+    build_lr_scheduler_,
+    register_lr_scheduler,
+    LR_SCHEDULER_REGISTRY,
+    LR_SCHEDULER_DATACLASS_REGISTRY,
+) = registry.setup_registry(
+    "--lr-scheduler", base_class=FairseqLRScheduler, default="fixed"
+)  # registry for the --lr-scheduler choice; "fixed" is the default entry
+
+
+def build_lr_scheduler(cfg: DictConfig, optimizer):  # wraps the registry's builder
+    return build_lr_scheduler_(cfg, optimizer)
+
+
+# automatically import any Python files in the optim/lr_scheduler/ directory
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        file_name = os.path.splitext(file)[0]  # drop only the trailing ".py"
+        importlib.import_module("fairseq.optim.lr_scheduler." + file_name)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
new file mode 100644
index 0000000000..5fcaea25d4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
@@ -0,0 +1,146 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import List
+
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class CosineLRScheduleConfig(FairseqDataclass):
+    warmup_updates: int = field(
+        default=0,
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    warmup_init_lr: float = field(
+        default=-1,
+        metadata={
+            "help": "initial learning rate during warmup phase; default is cfg.lr"
+        },
+    )
+    lr: List[float] = field(
+        default=II("optimization.lr"),
+        metadata={"help": "max learning rate, must be more than cfg.min_lr"},
+    )
+    min_lr: float = field(default=0.0, metadata={"help": "min learning rate"})
+    t_mult: float = field(
+        default=1.0, metadata={"help": "factor to grow the length of each period"}
+    )
+    lr_period_updates: float = field(
+        default=-1, metadata={"help": "initial number of updates per period"}
+    )
+    lr_shrink: float = field(
+        default=0.1, metadata={"help": "shrink factor for annealing"}
+    )
+    # Mirrors optimization.max_update; only used to infer lr_period_updates if unset
+    max_update: int = II("optimization.max_update")
+
+
+@register_lr_scheduler("cosine", dataclass=CosineLRScheduleConfig)
+class CosineLRSchedule(FairseqLRScheduler):
+    """Assign LR based on a cyclical schedule that follows the cosine function.
+
+    See https://arxiv.org/pdf/1608.03983.pdf for details.
+
+    We also support a warmup phase where we linearly increase the learning rate
+    from some initial learning rate (``--warmup-init-lr``) until the configured
+    max learning rate (``--lr``).
+
+    During warmup::
+
+      lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates)
+      lr = lrs[update_num]
+
+    After warmup::
+
+      lr = cfg.min_lr + 0.5*(cfg.lr - cfg.min_lr)*(1 + cos(pi * t_curr / t_i))
+
+    where ``t_curr`` is the number of updates done within the current period,
+    ``t_i`` is the length of the current period, which is scaled by ``t_mult``
+    after every period (both LR bounds are then scaled by ``lr_shrink``).
+    """
+
+    def __init__(self, cfg: CosineLRScheduleConfig, fairseq_optimizer):
+        super().__init__(cfg, fairseq_optimizer)
+        if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1:
+            raise ValueError(
+                "Cannot use a fixed learning rate schedule with cosine."
+                f" Consider --lr-scheduler=fixed instead. ({cfg.lr})"
+            )
+
+        self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
+        if self.max_lr < cfg.min_lr:
+            cfg.min_lr = self.max_lr
+
+        warmup_end_lr = self.max_lr
+        if cfg.warmup_init_lr < 0:
+            cfg.warmup_init_lr = cfg.min_lr
+
+        self.t_mult = cfg.t_mult
+        self.period = cfg.lr_period_updates
+
+        if self.period <= 0:
+            assert (
+                cfg.max_update > 0
+            ), "Either --max_update or --lr-period-updates must be set"
+            self.period = cfg.max_update - cfg.warmup_updates
+
+        if cfg.warmup_updates > 0:
+            # linearly warmup for the first cfg.warmup_updates
+            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates
+        else:
+            self.lr_step = 1
+
+        self.warmup_updates = cfg.warmup_updates
+        self.lr_shrink = cfg.lr_shrink
+
+        # initial learning rate
+        self.lr = cfg.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        if num_updates < self.cfg.warmup_updates:
+            self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
+        else:
+            curr_updates = num_updates - self.cfg.warmup_updates
+            if self.t_mult != 1:
+                i = math.floor(
+                    math.log(
+                        1 - curr_updates / self.period * (1 - self.t_mult), self.t_mult
+                    )
+                )
+                t_i = self.t_mult**i * self.period
+                t_curr = (
+                    curr_updates
+                    - (1 - self.t_mult**i) / (1 - self.t_mult) * self.period
+                )
+            else:
+                i = math.floor(curr_updates / self.period)  # index of the current period
+                t_i = self.period  # every period has the same length
+                t_curr = curr_updates - (self.period * i)  # updates into this period
+
+            lr_shrink = self.lr_shrink**i
+            min_lr = self.cfg.min_lr * lr_shrink
+            max_lr = self.max_lr * lr_shrink
+
+            self.lr = min_lr + 0.5 * (max_lr - min_lr) * (
+                1 + math.cos(math.pi * t_curr / t_i)
+            )
+
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
new file mode 100644
index 0000000000..6c12fa56b8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.optim import FairseqOptimizer
+
+
+class FairseqLRScheduler(object):  # base LR scheduler: stores cfg/optimizer, tracks best val loss
+    def __init__(self, cfg, optimizer):
+        super().__init__()
+        if optimizer is not None and not isinstance(optimizer, FairseqOptimizer):  # None is allowed
+            raise ValueError("optimizer must be an instance of FairseqOptimizer")
+        self.cfg = cfg
+        self.optimizer = optimizer
+        self.best = None  # lowest validation loss passed to step() so far
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add arguments to the parser for this LR scheduler."""
+        dc = getattr(cls, "__dataclass", None)  # config dataclass attached to the class, if any
+        if dc is not None:
+            gen_parser_from_dataclass(parser, dc())  # called with a default-constructed config
+
+    def state_dict(self):
+        """Return the LR scheduler state dict (only the best validation loss)."""
+        return {"best": self.best}
+
+    def load_state_dict(self, state_dict):
+        """Load an LR scheduler state dict; the ``"best"`` key is required."""
+        self.best = state_dict["best"]
+
+    def step_begin_epoch(self, epoch):
+        """Hook called at the beginning of the given epoch; a no-op in the base class."""
+        pass
+
+    def step(self, epoch, val_loss=None):
+        """Record ``val_loss`` at the end of the given epoch if it is the lowest seen so far."""
+        if val_loss is not None:
+            if self.best is None:
+                self.best = val_loss
+            else:
+                self.best = min(self.best, val_loss)  # lower is treated as better
+
+    def step_update(self, num_updates):
+        """Hook called after each update; the base class just returns the optimizer's LR."""
+        return self.optimizer.get_lr()
+
+
+class LegacyFairseqLRScheduler(FairseqLRScheduler):  # variant configured from an argparse Namespace
+    def __init__(self, args: Namespace, optimizer):  # NOTE: super().__init__() is not called, so self.cfg is never set
+        if not isinstance(optimizer, FairseqOptimizer):  # unlike the base class, None is rejected
+            raise ValueError("optimizer must be an instance of FairseqOptimizer")
+        self.args = args
+        self.optimizer = optimizer
+        self.best = None  # same best-validation-loss slot the base class uses
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
new file mode 100644
index 0000000000..d0e7e14b7e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Optional, List
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class FixedLRScheduleConfig(FairseqDataclass):  # options for the "fixed" LR scheduler
+    force_anneal: Optional[int] = field(
+        default=None,  # None disables forced annealing
+        metadata={"help": "force annealing at specified epoch"},
+    )
+    lr_shrink: float = field(
+        default=0.1,
+        metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"},
+    )
+    warmup_updates: int = field(
+        default=0,  # 0 disables warmup
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    lr: List[float] = II("optimization.lr")  # interpolated from the ``optimization.lr`` setting
+
+
+@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig)
+class FixedLRSchedule(FairseqLRScheduler):
+    """Pick the LR per epoch from ``cfg.lr``, with optional linear warmup and forced annealing."""
+
+    def __init__(self, cfg: FixedLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+
+        self.lr = cfg.lr[0]
+        if cfg.warmup_updates > 0:
+            self.warmup_factor = 1.0 / cfg.warmup_updates  # scale applied before the first update
+        else:
+            self.warmup_factor = 1  # no warmup: the LR is used unscaled
+
+    def state_dict(self):  # replaces the base-class state: only the current LR is saved
+        return {"lr": self.lr}
+
+    def load_state_dict(self, state_dict):  # tolerant of state dicts without an "lr" key
+        if "lr" in state_dict:
+            self.lr = state_dict["lr"]
+
+    def get_next_lr(self, epoch):  # returns the unscaled LR to use for ``epoch``
+        lrs = self.cfg.lr
+        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
+            # use fixed LR schedule: epoch 1 -> lrs[0]; the last entry is reused afterwards
+            next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
+        else:
+            # anneal based on lr_shrink: one extra factor per epoch from force_anneal onwards
+            next_lr = lrs[-1] * self.cfg.lr_shrink ** (
+                epoch + 1 - self.cfg.force_anneal
+            )
+        return next_lr
+
+    def step_begin_epoch(self, epoch):
+        """Select the LR for ``epoch``, apply the current warmup factor and return the result."""
+        self.lr = self.get_next_lr(epoch)
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Scale the LR linearly during warmup, otherwise apply it unscaled; return the result."""
+        if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates:
+            self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates)  # reaches 1.0 on the last warmup update
+            self.optimizer.set_lr(self.warmup_factor * self.lr)
+        else:
+            self.optimizer.set_lr(self.lr)
+        return self.optimizer.get_lr()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
new file mode 100644
index 0000000000..987c905a23
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import List
+
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class InverseSquareRootLRScheduleConfig(FairseqDataclass):  # options for "inverse_sqrt"
+    warmup_updates: int = field(
+        default=4000,
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    warmup_init_lr: float = field(
+        default=-1,  # negative means unset: the scheduler uses 0 with warmup, else lr
+        metadata={
+            "help": "initial learning rate during warmup phase; default is cfg.lr"
+        },
+    )
+    lr: List[float] = II("optimization.lr")  # interpolated from the ``optimization.lr`` setting
+
+
+@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootLRScheduleConfig)
+class InverseSquareRootSchedule(FairseqLRScheduler):
+    """Decay the LR based on the inverse square root of the update number.
+
+    We also support a warmup phase where we linearly increase the learning rate
+    from some initial learning rate (``--warmup-init-lr``) until the configured
+    learning rate (``--lr``). Thereafter we decay proportionally to the inverse
+    square root of the update number; ``warmup_updates == 0`` is treated as 1.
+
+    During warmup::
+
+      lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates)
+      lr = lrs[update_num]
+
+    After warmup::
+
+      decay_factor = cfg.lr * sqrt(max(cfg.warmup_updates, 1))
+      lr = decay_factor / sqrt(max(update_num, 1))
+    """
+
+    def __init__(self, cfg: InverseSquareRootLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+        if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1:
+            raise ValueError(
+                "Cannot use a fixed learning rate schedule with inverse_sqrt."
+                " Consider --lr-scheduler=fixed instead."
+            )
+        warmup_end_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
+        if cfg.warmup_init_lr < 0:  # unset: start at 0 with warmup, at the target LR without
+            cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr
+
+        # linearly warmup for the first cfg.warmup_updates (max() avoids dividing by 0)
+        self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / max(cfg.warmup_updates, 1)
+
+        # then, decay prop. to the inverse square root of the update number
+        self.decay_factor = warmup_end_lr * max(cfg.warmup_updates, 1) ** 0.5
+
+        # initial learning rate
+        self.lr = cfg.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    def step(self, epoch, val_loss=None):
+        """Run the base-class end-of-epoch step and return the optimizer's current LR."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set and return the LR for update ``num_updates`` (warmup ramp, then 1/sqrt decay)."""
+        if num_updates < self.cfg.warmup_updates:
+            self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
+        else:  # max() only matters for update 0 without warmup, where 0**-0.5 would raise
+            self.lr = self.decay_factor * max(num_updates, 1) ** -0.5
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
new file mode 100644
index 0000000000..57edc256fd
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import LegacyFairseqLRScheduler, register_lr_scheduler
+import logging
+import ast
+
+logger = logging.getLogger(__name__)  # module-level logger
+logger.setLevel(logging.WARNING)  # the logger.info(...) calls in this module are therefore dropped
+
+
+@register_lr_scheduler("manual")
+class ManualSchedule(LegacyFairseqLRScheduler):
+    """Set the LR from user-supplied ``--epoch2lr`` / ``--update2lr`` dictionaries."""
+
+    def __init__(self, args, optimizer):
+        super().__init__(args, optimizer)
+
+        self.epoch2lr = self.parse_manuallr_args(args.epoch2lr)  # {epoch: lr}
+        self.update2lr = self.parse_manuallr_args(args.update2lr)  # {update: lr}
+        logger.info("@@@ ManualSchedule epoch2lr={}".format(self.epoch2lr))
+        logger.info("@@@ ManualSchedule update2lr={}".format(self.update2lr))
+
+        if 1 in self.epoch2lr:  # initial LR: epoch 1, else update 1, else the first --lr value
+            self.lr = self.epoch2lr[1]
+        elif 1 in self.update2lr:
+            self.lr = self.update2lr[1]
+        else:
+            self.lr = args.lr[0]
+        self.optimizer.set_lr(self.lr)  # Set the beginning of the epoch.
+
+    def parse_manuallr_args(self, lr_args_str):
+        """Parse a dict literal into ``{int: float}``; keys may be "3", "1,2" or "4-6" (inclusive)."""
+        lr_dict = ast.literal_eval(lr_args_str.replace(" ", ""))
+        if not isinstance(lr_dict, dict):
+            raise ValueError("epoch2lr/update2lr must evaluate to a dict")
+        lr_args = {}
+        logger.info("@@@ after parsing input dictionary lr_dict = {}".format(lr_dict))
+        for key, val in lr_dict.items():
+            key = str(key)  # literal_eval yields int keys for input like {1: 0.1}
+            if "," in key:  # comma list: every listed index gets the value
+                for k in key.split(","):
+                    lr_args[int(k)] = float(val)
+            elif "-" in key:  # inclusive range "start-end"
+                s, e = map(int, key.split("-")[:2])
+                for k in range(s, e + 1, 1):
+                    lr_args[k] = float(val)
+            else:
+                lr_args[int(key)] = float(val)
+
+        return lr_args
+
+    @staticmethod
+    def add_args(parser):
+        """Add arguments to the parser for this LR scheduler."""
+        # fmt: off
+        parser.add_argument(
+            "--epoch2lr",
+            type=str,
+            metavar="DICT",
+            default="{}",
+            help="a dictionary used to set lr for each epoch manually",
+        )
+        parser.add_argument(
+            "--update2lr",
+            type=str,
+            metavar="DICT",
+            default="{}",
+            help="a dictionary used to set lr for each update manually",
+        )
+        # fmt: on
+
+    def state_dict(self):  # only the current LR is saved
+        return {"lr": self.lr}
+
+    def load_state_dict(self, state_dict):  # tolerant of state dicts without an "lr" key
+        if "lr" in state_dict:
+            self.lr = state_dict["lr"]
+
+    def get_next_lr(self, epoch):  # LR of the latest configured epoch <= ``epoch``
+        manual_keys = [k for k in self.epoch2lr if k <= epoch]
+        if manual_keys:
+            manual_lr = self.epoch2lr[max(manual_keys)]
+        else:
+            logger.warning(
+                "@@@ epoch={} does not exist in manual lr input. epoch2lr={}...".format(
+                    epoch,
+                    # preview at most the first 10 entries; the previous ``len - 1``
+                    # bound dropped the last entry and hid a single-entry dict
+                    list(self.epoch2lr.items())[:10],
+                )
+            )
+            manual_lr = self.optimizer.get_lr()  # nothing configured yet: keep the current LR
+        return manual_lr
+
+    def step_begin_epoch(self, epoch):
+        """Update the learning rate at the beginning of the given epoch."""
+        self.lr = self.get_next_lr(epoch)
+        self.optimizer.set_lr(self.lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Apply the LR of the latest configured update <= ``num_updates`` and return it."""
+        manual_keys = [k for k in self.update2lr if k <= num_updates]
+        if manual_keys:
+            manual_lr = self.update2lr[max(manual_keys)]
+        else:
+            logger.warning(
+                "update={} does not exist in manual lr input update2lr={}...".format(
+                    num_updates,
+                    # preview at most the first 10 entries; the previous ``len - 1``
+                    # bound dropped the last entry and hid a single-entry dict
+                    list(self.update2lr.items())[:10],
+                )
+            )
+            manual_lr = self.optimizer.get_lr()  # nothing configured yet: keep the current LR
+
+        self.optimizer.set_lr(manual_lr)
+        return self.optimizer.get_lr()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
new file mode 100644
index 0000000000..2f93db328c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class PassThroughScheduleConfig(FairseqDataclass):  # the "pass_through" scheduler has no options
+    pass
+
+
+@register_lr_scheduler("pass_through", dataclass=PassThroughScheduleConfig)
+class PassThroughScheduleSchedule(FairseqLRScheduler):
+    """Delegate LR scheduling to the optimizer's own ``lr_scheduler`` attribute."""
+
+    def __init__(self, cfg: PassThroughScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+        assert (  # the optimizer must carry a non-None ``lr_scheduler`` to forward to
+            hasattr(optimizer, "lr_scheduler") and optimizer.lr_scheduler is not None
+        ), "Pass-through schedule can only be used with optimizers with their own schedulers"
+
+    def state_dict(self):  # state is owned by the wrapped scheduler
+        return self.optimizer.lr_scheduler.state_dict()
+
+    def load_state_dict(self, state_dict):  # forwarded unchanged to the wrapped scheduler
+        self.optimizer.lr_scheduler.load_state_dict(state_dict)
+
+    def step_begin_epoch(self, epoch):
+        """Forward the start-of-epoch hook to the optimizer's scheduler and return its result."""
+        return self.optimizer.lr_scheduler.step_begin_epoch(epoch)
+
+    def step_update(self, num_updates):
+        """Forward the per-update hook to the optimizer's scheduler and return its result."""
+        return self.optimizer.lr_scheduler.step_update(num_updates)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
new file mode 100644
index 0000000000..b8109a7c1e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Optional, List
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class PolynomialDecayLRScheduleConfig(FairseqDataclass):  # options for "polynomial_decay"
+    warmup_updates: int = field(
+        default=0,  # 0 disables warmup
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    force_anneal: Optional[int] = field(
+        default=None,  # None: the per-epoch LR list is always used
+        metadata={"help": "force annealing at specified epoch"},
+    )
+    end_learning_rate: float = field(
+        default=0.0,
+        metadata={"help": "learning rate to decay to"},
+    )
+    power: float = field(
+        default=1.0,  # 1.0 gives a linear decay
+        metadata={"help": "decay exponent"},
+    )
+    total_num_update: float = field(
+        default=II("optimization.max_update"),  # interpolated from ``optimization.max_update``
+        metadata={"help": "total number of updates over which to decay learning rate"},
+    )
+    lr: List[float] = II("optimization.lr")  # interpolated from the ``optimization.lr`` setting
+
+
+@register_lr_scheduler("polynomial_decay", dataclass=PolynomialDecayLRScheduleConfig)
+class PolynomialDecayLRSchedule(FairseqLRScheduler):
+    """Decay the LR polynomially towards ``end_learning_rate`` after an optional linear warmup."""
+
+    def __init__(self, cfg: PolynomialDecayLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+
+        assert cfg.total_num_update > 0
+
+        self.lr = cfg.lr[0]
+        if cfg.warmup_updates > 0:
+            self.warmup_factor = 1.0 / cfg.warmup_updates  # scale applied before the first update
+        else:
+            self.warmup_factor = 1  # no warmup: the LR is used unscaled
+        self.end_learning_rate = cfg.end_learning_rate
+        self.total_num_update = cfg.total_num_update
+        self.power = cfg.power
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+
+    def get_next_lr(self, epoch):  # returns the unscaled LR to use for ``epoch``
+        lrs = self.cfg.lr
+        if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal:
+            # use fixed LR schedule. NOTE(review): indexes with ``epoch``, not ``epoch - 1`` - confirm intended
+            next_lr = lrs[min(epoch, len(lrs) - 1)]
+        else:
+            # force_anneal reached: keep the optimizer's current LR (no lr_shrink is applied here)
+            next_lr = self.optimizer.get_lr()
+        return next_lr
+
+    def step_begin_epoch(self, epoch):
+        """Select the LR for ``epoch``, apply the current warmup factor and return the result."""
+        self.lr = self.get_next_lr(epoch)
+        self.optimizer.set_lr(self.warmup_factor * self.lr)
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set the LR for update ``num_updates`` (warmup, polynomial decay, then floor) and return it."""
+        if self.cfg.warmup_updates > 0 and num_updates <= self.cfg.warmup_updates:
+            self.warmup_factor = num_updates / float(self.cfg.warmup_updates)  # 0.0 at update 0, 1.0 at the end
+            lr = self.warmup_factor * self.lr
+        elif num_updates >= self.total_num_update:
+            lr = self.end_learning_rate  # decay finished: stay at the floor
+        else:
+            warmup = self.cfg.warmup_updates
+            lr_range = self.lr - self.end_learning_rate
+            pct_remaining = 1 - (num_updates - warmup) / (  # fraction of the decay phase still ahead
+                self.total_num_update - warmup
+            )
+            lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate
+        self.optimizer.set_lr(lr)
+        return self.optimizer.get_lr()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
new file mode 100644
index 0000000000..5ee9c1be4a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
@@ -0,0 +1,143 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import List
+
+import torch.optim.lr_scheduler
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class ReduceLROnPlateauLRScheduleConfig(FairseqDataclass):  # options for "reduce_lr_on_plateau"
+    lr_shrink: float = field(
+        default=0.1, metadata={"help": "shrink factor for annealing"}
+    )
+    lr_threshold: float = field(
+        default=1e-4,
+        metadata={
+            "help": (
+                "threshold for measuring the new optimum, to only focus on "
+                "significant changes"
+            )
+        },
+    )
+    lr_patience: int = field(
+        default=0,
+        metadata={
+            "help": (
+                "number of epochs with no improvement after which learning rate will "
+                "be reduced"
+            )
+        },
+    )
+    warmup_updates: int = field(
+        default=0,  # 0 disables warmup
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    warmup_init_lr: float = field(
+        default=-1,  # negative means unset: the scheduler uses 0 with warmup, else lr
+        metadata={
+            "help": "initial learning rate during warmup phase; default is cfg.lr"
+        },
+    )
+    lr: List[float] = II("optimization.lr")  # interpolated from the ``optimization.lr`` setting
+    maximize_best_checkpoint_metric: bool = II(  # selects "max" vs "min" mode in the scheduler
+        "checkpoint.maximize_best_checkpoint_metric"
+    )
+
+
+@register_lr_scheduler(
+    "reduce_lr_on_plateau", dataclass=ReduceLROnPlateauLRScheduleConfig
+)
+class ReduceLROnPlateauLRSchedule(FairseqLRScheduler):
+    """
+    Decay the LR by a factor every time the validation loss plateaus.
+    Also comes with an optional warmup phase, where we linearly increase
+    the learning rate from some initial learning rate
+    (``--warmup-init-lr``) until the configured learning rate
+    (``--lr``). Thereafter the LR is adjusted by the wrapped
+    ``torch.optim.lr_scheduler.ReduceLROnPlateau`` instance.
+
+    During warmup::
+
+      lrs = torch.linspace(
+          cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates
+      )
+      lr = lrs[update_num]
+    """
+
+    def __init__(self, cfg: ReduceLROnPlateauLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+        if len(cfg.lr) > 1:
+            raise ValueError(
+                "Cannot use a fixed learning rate schedule with reduce_lr_on_plateau."
+                " Consider --lr-scheduler=fixed instead."
+            )
+        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            self.optimizer.optimizer,  # the object wrapped by the fairseq optimizer
+            patience=cfg.lr_patience,
+            factor=cfg.lr_shrink,
+            mode="max" if cfg.maximize_best_checkpoint_metric else "min",
+            threshold=cfg.lr_threshold,
+        )
+        warmup_end_lr = cfg.lr[0]
+        # if no warm up, sets initial lr to be cfg.lr[0]
+        if cfg.warmup_init_lr < 0:
+            cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr
+
+        # linearly warmup for the first cfg.warmup_updates
+        if cfg.warmup_updates > 0:
+            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates
+
+        # this flag is either set from arg when no warm up, or set by
+        # step_update() when warmup finishes
+        self.warmup_end = True if cfg.warmup_updates <= 0 else False
+
+        # initial learning rate
+        # this self.lr is used only during init and/or warm up period
+        self.lr = warmup_end_lr if self.warmup_end else cfg.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    def state_dict(self):
+        """Return the LR scheduler state dict (two fields of the wrapped torch scheduler)."""
+        return {
+            "best": self.lr_scheduler.best,
+            "last_epoch": self.lr_scheduler.last_epoch,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Load an LR scheduler state dict; ``"last_epoch"`` is optional."""
+        self.lr_scheduler.best = state_dict["best"]
+        if "last_epoch" in state_dict:
+            self.lr_scheduler.last_epoch = state_dict["last_epoch"]
+
+    def step(self, epoch, val_loss=None):
+        """
+        Step the wrapped scheduler with ``val_loss`` at the end of the epoch once
+        warmup has finished; otherwise only record the epoch. Returns the current LR.
+        """
+        if val_loss is not None and self.warmup_end is True:
+            self.lr_scheduler.step(val_loss)
+        else:
+            self.lr_scheduler.last_epoch = epoch  # keep the epoch counter in sync without stepping
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """
+        Apply the linear warmup ramp while it lasts, then return the current LR."""
+        # if there is warmup
+        if self.cfg.warmup_updates > 0:
+            if num_updates <= self.cfg.warmup_updates:
+                self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
+                self.optimizer.set_lr(self.lr)
+            else:
+                if self.warmup_end is False:  # first update after warmup: enable plateau stepping
+                    self.warmup_end = True
+        # else do nothing
+        return self.optimizer.get_lr()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
new file mode 100644
index 0000000000..db99d4eee8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import List
+
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class StepLRScheduleConfig(FairseqDataclass):  # options for the "step" LR scheduler
+    warmup_updates: int = field(
+        default=0,  # 0 disables warmup
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    warmup_init_lr: float = field(
+        default=-1,  # negative means unset: the scheduler then starts from min_lr
+        metadata={
+            "help": "initial learning rate during warmup phase; default is cfg.lr"
+        },
+    )
+    lr: List[float] = field(
+        default=II("optimization.lr"),  # interpolated from the ``optimization.lr`` setting
+        metadata={"help": "max learning rate, must be more than cfg.min_lr"},
+    )
+    min_lr: float = field(default=0.0, metadata={"help": "min learning rate"})
+    lr_deacy_period: int = field(default=25000, metadata={"help": "decay period"})  # (sic) spelling is part of the config interface
+    lr_decay: float = field(default=0.5, metadata={"help": "decay factor"})
+
+
+@register_lr_scheduler("step", dataclass=StepLRScheduleConfig)
+class StepLRSchedule(FairseqLRScheduler):
+    """Decay the learning rate by a fixed factor every ``lr_deacy_period`` updates."""
+
+    def __init__(self, cfg: StepLRScheduleConfig, fairseq_optimizer):
+        super().__init__(cfg, fairseq_optimizer)
+        self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr  # accepts a list or a scalar
+        self.min_lr = cfg.min_lr
+        self.lr_deacy_period = cfg.lr_deacy_period
+        self.lr_decay = cfg.lr_decay
+        self.warmup_updates = cfg.warmup_updates
+        self.warmup_init_lr = (  # a negative (unset) value falls back to min_lr
+            cfg.warmup_init_lr if cfg.warmup_init_lr >= 0 else self.min_lr
+        )
+
+        assert self.lr_deacy_period > 0
+        assert self.lr_decay <= 1
+        assert self.min_lr >= 0
+        assert self.max_lr > self.min_lr
+
+        if cfg.warmup_updates > 0:
+            # linearly warmup for the first cfg.warmup_updates
+            self.warmup_lr_step = (
+                self.max_lr - self.warmup_init_lr
+            ) / self.warmup_updates
+        else:
+            self.warmup_lr_step = 1  # placeholder: unused when the warmup branch cannot trigger
+
+        # initial learning rate
+        self.lr = self.warmup_init_lr
+        self.optimizer.set_lr(self.lr)
+
+    def step(self, epoch, val_loss=None):
+        """Run the base-class end-of-epoch step and return the optimizer's current LR."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set and return the LR for update ``num_updates`` (warmup ramp, then stepwise decay)."""
+        if num_updates < self.cfg.warmup_updates:
+            self.lr = self.warmup_init_lr + num_updates * self.warmup_lr_step
+        else:
+            curr_updates = num_updates - self.cfg.warmup_updates  # updates since warmup ended
+            lr_mult = self.lr_decay ** (curr_updates // self.lr_deacy_period)  # one factor per full period
+            self.lr = max(self.max_lr * lr_mult, self.min_lr)  # never drop below min_lr
+
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
new file mode 100644
index 0000000000..4d5547c39b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
@@ -0,0 +1,175 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from typing import Optional, List, Tuple
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class TriStageLRScheduleConfig(FairseqDataclass):  # options for the "tri_stage" LR scheduler
+    warmup_steps: int = field(
+        default=0,
+        metadata={"help": "warmup the learning rate linearly for the first N updates"},
+    )
+    hold_steps: int = field(
+        default=0,
+        metadata={"help": "steps in hold stage"},
+    )
+    decay_steps: int = field(
+        default=0,
+        metadata={"help": "steps in decay stages"},
+    )
+    phase_ratio: Optional[Tuple[float, float, float]] = field(
+        default=None,  # when set, overrides the three *_steps values above
+        metadata={
+            "help": (
+                "if set, automatically sets warmup/hold/decay steps to the ratio "
+                "specified here from max_updates. the ratios must add up to 1.0"
+            )
+        },
+    )
+    init_lr_scale: float = field(
+        default=0.01,  # the warmup stage starts at init_lr_scale * lr
+        metadata={"help": "initial learning rate scale during warmup phase"},
+    )
+    final_lr_scale: float = field(
+        default=0.01,  # the decay stage ends at final_lr_scale * lr
+        metadata={"help": "final learning rate scale"},
+    )
+    max_update: float = II("optimization.max_update")  # interpolated from ``optimization.max_update``
+    lr: List[float] = II("optimization.lr")  # interpolated from the ``optimization.lr`` setting
+
+
+@register_lr_scheduler("tri_stage", dataclass=TriStageLRScheduleConfig)
+class TriStageLRSchedule(FairseqLRScheduler):
+    """Tri-stage learning rate scheduler
+
+    Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf
+
+    Similar to the inverse_square_root scheduler, but tri_stage employs
+    three stages of LR scheduling:
+
+        - warmup stage, starting from `lr` * `init_lr_scale`, linearly
+          increased to `lr` in `warmup_steps` iterations
+
+        - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps`
+          iterations
+
+        - decay stage, after hold stage, decay LR exponentially to
+          `lr` * `final_lr_scale` in `decay_steps`;
+          after that the LR is kept at `final_lr_scale` * `lr`
+
+    During warmup::
+
+      init_lr = cfg.init_lr_scale * cfg.lr
+      lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps)
+      lr = lrs[update_num]
+
+    During hold::
+
+      lr = cfg.lr
+
+    During decay::
+
+      decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps
+      lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor)
+
+    After that::
+
+      lr = cfg.lr * cfg.final_lr_scale
+    """
+
+    def __init__(self, cfg: TriStageLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+        if len(cfg.lr) > 1:
+            raise ValueError(
+                "Cannot use a fixed learning rate schedule with tri-stage lr."
+                " Consider --lr-scheduler=fixed instead."
+            )
+
+        # calculate LR at each point
+        self.peak_lr = cfg.lr[0]
+        self.init_lr = cfg.init_lr_scale * cfg.lr[0]
+        self.final_lr = cfg.final_lr_scale * cfg.lr[0]
+
+        if cfg.phase_ratio is not None:
+            assert cfg.max_update > 0
+            assert math.isclose(sum(cfg.phase_ratio), 1.0), "phase ratios must add up to 1"
+            self.warmup_steps = int(cfg.max_update * cfg.phase_ratio[0])
+            self.hold_steps = int(cfg.max_update * cfg.phase_ratio[1])
+            self.decay_steps = int(cfg.max_update * cfg.phase_ratio[2])
+        else:
+            self.warmup_steps = cfg.warmup_steps
+            self.hold_steps = cfg.hold_steps
+            self.decay_steps = cfg.decay_steps
+
+        assert (
+            self.warmup_steps + self.hold_steps + self.decay_steps > 0
+        ), "please specify steps or phase_ratio"
+
+        self.warmup_rate = (  # per-step LR increment (0 without a warmup stage)
+            (self.peak_lr - self.init_lr) / self.warmup_steps if self.warmup_steps != 0 else 0
+        )
+        self.decay_factor = (  # 0 without a decay stage, instead of dividing by zero
+            -math.log(cfg.final_lr_scale) / self.decay_steps if self.decay_steps != 0 else 0
+        )
+
+        # initial learning rate
+        self.lr = self.init_lr
+        self.optimizer.set_lr(self.lr)
+
+    def _decide_stage(self, update_step):
+        """
+        return (stage, steps elapsed within that stage); stages are 0=warmup, 1=hold, 2=decay, 3=final
+        """
+        if update_step < self.warmup_steps:
+            # warmup stage
+            return 0, update_step
+
+        offset = self.warmup_steps
+
+        if update_step < offset + self.hold_steps:
+            # hold stage
+            return 1, update_step - offset
+
+        offset += self.hold_steps
+
+        if update_step <= offset + self.decay_steps:
+            # decay stage (inclusive upper bound: the last decay step lands on final_lr)
+            return 2, update_step - offset
+
+        offset += self.decay_steps
+
+        # still here ? constant lr stage
+        return 3, update_step - offset
+
+    def step(self, epoch, val_loss=None):
+        """Run the base-class end-of-epoch step and return the optimizer's current LR."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Set and return the LR for update ``num_updates`` according to its stage."""
+        stage, steps_in_stage = self._decide_stage(num_updates)
+        if stage == 0:
+            self.lr = self.init_lr + self.warmup_rate * steps_in_stage
+        elif stage == 1:
+            self.lr = self.peak_lr
+        elif stage == 2:
+            self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage)
+        elif stage == 3:
+            self.lr = self.final_lr
+        else:
+            raise ValueError("Undefined stage")
+
+        self.optimizer.set_lr(self.lr)
+
+        return self.lr
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
new file mode 100644
index 0000000000..2a32bd10f2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass, field
+from typing import List
+
+from omegaconf import II
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler
+
+
+@dataclass
+class TriangularLRScheduleConfig(FairseqDataclass):
+    max_lr: float = field(
+        default="???", metadata={"help": "max learning rate, must be more than cfg.lr"}
+    )
+    lr_period_updates: float = field(
+        default=5000,  # one full cycle; the scheduler uses half of it as its step size
+        metadata={"help": "initial number of updates per period (cycle length)"},
+    )
+    lr_shrink: float = field(
+        default=0.1, metadata={"help": "shrink factor for annealing"}
+    )
+    shrink_min: bool = field(
+        default=False, metadata={"help": "if set, also shrinks min lr"}
+    )
+    lr: List[float] = II("optimization.lr")  # lr[0] becomes the cycle's minimum lr
+
+
+@register_lr_scheduler("triangular", dataclass=TriangularLRScheduleConfig)
+class TriangularLRSchedule(FairseqLRScheduler):
+    """Assign LR based on a triangular cyclical schedule.
+
+    See https://arxiv.org/pdf/1506.01186.pdf for details.
+    """
+
+    def __init__(self, cfg: TriangularLRScheduleConfig, optimizer):
+        super().__init__(cfg, optimizer)
+        if len(cfg.lr) > 1:
+            raise ValueError(
+                "Cannot use a fixed learning rate schedule with triangular."
+                " Consider --lr-scheduler=fixed instead."
+            )
+
+        lr = cfg.lr[0]
+
+        assert cfg.max_lr > lr, "max_lr must be more than lr"
+        self.min_lr = lr
+        self.max_lr = cfg.max_lr
+        self.stepsize = cfg.lr_period_updates // 2
+        self.lr_shrink = cfg.lr_shrink
+        self.shrink_min = cfg.shrink_min
+
+        # initial learning rate
+        self.lr = self.min_lr
+        self.optimizer.set_lr(self.lr)
+
+    def step(self, epoch, val_loss=None):
+        """Update the learning rate at the end of the given epoch."""
+        super().step(epoch, val_loss)
+        # we don't change the learning rate at epoch boundaries
+        return self.optimizer.get_lr()
+
+    def step_update(self, num_updates):
+        """Update the learning rate after each update."""
+        cycle = math.floor(num_updates / (2 * self.stepsize))
+
+        lr_shrink = self.lr_shrink**cycle
+        max_lr = self.max_lr * lr_shrink
+        if self.shrink_min:
+            min_lr = self.min_lr * lr_shrink
+        else:
+            min_lr = self.min_lr
+
+        x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1)
+        self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x))
+
+        self.optimizer.set_lr(self.lr)
+        return self.lr
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
new file mode 100644
index 0000000000..c30a6c0fb1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
@@ -0,0 +1,111 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Collection
+from dataclasses import dataclass, field
+from typing import List
+
+import torch
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import II, DictConfig
+from torch.optim.optimizer import Optimizer, required
+
+from . import FairseqOptimizer, register_optimizer
+
+
+@dataclass
+class FairseqNAGConfig(FairseqDataclass):
+    momentum: float = field(default=0.99, metadata={"help": "momentum factor"})
+    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
+    # TODO common vars in parent class
+    lr: List[float] = II("optimization.lr")
+
+
+@register_optimizer("nag", dataclass=FairseqNAGConfig)
+class FairseqNAG(FairseqOptimizer):
+    def __init__(self, cfg: DictConfig, params):
+        super().__init__(cfg)
+        self._optimizer = NAG(params, **self.optimizer_config)
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.cfg.lr[0]
+            if isinstance(self.cfg.lr, Collection)
+            else self.cfg.lr,
+            "momentum": self.cfg.momentum,
+            "weight_decay": self.cfg.weight_decay,
+        }
+
+
+class NAG(Optimizer):
+    def __init__(self, params, lr=required, momentum=0, weight_decay=0):
+        defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay)
+        super(NAG, self).__init__(params, defaults)
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return True
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            weight_decay = group["weight_decay"]
+            momentum = group["momentum"]
+            lr = group["lr"]
+            lr_old = group.get("lr_old", lr)
+            lr_correct = lr / lr_old if lr_old > 0 else lr
+
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+
+                p_data_fp32 = p.data
+                if p_data_fp32.dtype in {torch.float16, torch.bfloat16}:
+                    p_data_fp32 = p_data_fp32.float()
+
+                d_p = p.grad.data.float()
+                param_state = self.state[p]
+                if "momentum_buffer" not in param_state:
+                    param_state["momentum_buffer"] = torch.zeros_like(d_p)
+                else:
+                    param_state["momentum_buffer"] = param_state["momentum_buffer"].to(
+                        d_p
+                    )
+
+                buf = param_state["momentum_buffer"]
+                # scale weights by (1 - lr * weight_decay), then add momentum and grad terms
+                if weight_decay != 0:
+                    p_data_fp32.mul_(1 - lr * weight_decay)
+                p_data_fp32.add_(buf, alpha=momentum * momentum * lr_correct)
+                p_data_fp32.add_(d_p, alpha=-(1 + momentum) * lr)
+                # refresh the buffer; lr_correct (lr / lr_old) rescales the old contents
+                buf.mul_(momentum * lr_correct).add_(d_p, alpha=-lr)
+                # fp16/bf16 params were updated on an fp32 copy; write the result back
+                if p.data.dtype in {torch.float16, torch.bfloat16}:
+                    p.data.copy_(p_data_fp32)
+            # remember this lr so the next step can compute lr_correct
+            group["lr_old"] = lr
+
+        return loss
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
new file mode 100644
index 0000000000..8e34fb99a1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.optim
+
+from . import LegacyFairseqOptimizer, register_optimizer
+
+
+@register_optimizer("sgd")
+class SGD(LegacyFairseqOptimizer):
+    def __init__(self, args, params):
+        super().__init__(args)
+        self._optimizer = torch.optim.SGD(params, **self.optimizer_config)
+
+    @staticmethod
+    def add_args(parser):
+        """Add optimizer-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--momentum', default=0.0, type=float, metavar='M',
+                            help='momentum factor')
+        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
+                            help='weight decay')
+        # fmt: on
+
+    @property
+    def optimizer_config(self):
+        """
+        Return a kwarg dictionary that will be used to override optimizer
+        args stored in checkpoints. This allows us to load a checkpoint and
+        resume training using a different set of optimizer args, e.g., with a
+        different learning rate.
+        """
+        return {
+            "lr": self.args.lr[0],
+            "momentum": self.args.momentum,
+            "weight_decay": self.args.weight_decay,
+        }
+
+    @property
+    def supports_flat_params(self):
+        return True
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
new file mode 100644
index 0000000000..9d7f2eb9e5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict
+
+from fairseq.distributed import utils
+
+
+try:
+    from fairscale.optim import OSS
+
+    _has_fairscale = True
+except ImportError:
+    _has_fairscale = False
+
+
+def shard_(optimizer, group):
+    if not _has_fairscale:
+        raise ImportError(
+            "\n\nPlease install the fairscale package:" "\n\n  pip install fairscale"
+        )
+
+    class FairseqOSS(OSS):
+        @property
+        def disable_mem_eff_fp16_loading_hack(self):
+            return True
+
+        def __getattr__(self, name):
+            if name.startswith("supports") and hasattr(self.optim, name):
+                return getattr(self.optim, name)
+            raise AttributeError(
+                "'FairseqOSS' object has no attribute {0!r}".format(name)
+            )
+
+        def broadcast_global_state_dict(
+            self, state_dict: Dict[str, Any]
+        ) -> Dict[str, Any]:
+            """
+            Broadcasts the entire state_dict to all other ranks
+            each rank is responsible to load their own partition of data
+            """
+            return utils.broadcast_object(
+                state_dict,
+                src_rank=0,
+                group=self.group,
+            )
+
+    torch_optimizer = optimizer.optimizer
+    optim_cls = type(torch_optimizer)
+
+    optimizer.optimizer = FairseqOSS(
+        torch_optimizer.param_groups,
+        optim_cls,
+        group=group,
+        **optimizer.optimizer_config
+    )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
new file mode 100644
index 0000000000..920591635a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
@@ -0,0 +1,413 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from pathlib import Path
+from typing import Callable, List, Optional, Union
+
+import torch
+from fairseq import utils
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+from fairseq.dataclass.configs import (
+    CheckpointConfig,
+    CommonConfig,
+    CommonEvalConfig,
+    DatasetConfig,
+    DistributedTrainingConfig,
+    EvalLMConfig,
+    GenerationConfig,
+    InteractiveConfig,
+    OptimizationConfig,
+    EMAConfig,
+)
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+
+# this import is for backward compatibility
+from fairseq.utils import csv_str_list, eval_bool, eval_str_dict, eval_str_list  # noqa
+
+
+def get_preprocessing_parser(default_task="translation"):
+    parser = get_parser("Preprocessing", default_task)
+    add_preprocess_args(parser)
+    return parser
+
+
+def get_training_parser(default_task="translation"):
+    parser = get_parser("Trainer", default_task)
+    add_dataset_args(parser, train=True)
+    add_distributed_training_args(parser)
+    add_model_args(parser)
+    add_optimization_args(parser)
+    add_checkpoint_args(parser)
+    add_ema_args(parser)
+    return parser
+
+
+def get_generation_parser(interactive=False, default_task="translation"):
+    parser = get_parser("Generation", default_task)
+    add_dataset_args(parser, gen=True)
+    add_distributed_training_args(parser, default_world_size=1)
+    add_generation_args(parser)
+    add_checkpoint_args(parser)
+    if interactive:
+        add_interactive_args(parser)
+    return parser
+
+
+def get_speech_generation_parser(default_task="text_to_speech"):
+    parser = get_parser("Speech Generation", default_task)
+    add_dataset_args(parser, gen=True)
+    add_distributed_training_args(parser, default_world_size=1)
+    add_speech_generation_args(parser)
+    return parser
+
+
+def get_interactive_generation_parser(default_task="translation"):
+    return get_generation_parser(interactive=True, default_task=default_task)
+
+
+def get_eval_lm_parser(default_task="language_modeling"):
+    parser = get_parser("Evaluate Language Model", default_task)
+    add_dataset_args(parser, gen=True)
+    add_distributed_training_args(parser, default_world_size=1)
+    add_eval_lm_args(parser)
+    return parser
+
+
+def get_validation_parser(default_task=None):
+    parser = get_parser("Validation", default_task)
+    add_dataset_args(parser, train=True)
+    add_distributed_training_args(parser, default_world_size=1)
+    group = parser.add_argument_group("Evaluation")
+    gen_parser_from_dataclass(group, CommonEvalConfig())
+    return parser
+
+
+def parse_args_and_arch(
+    parser: argparse.ArgumentParser,
+    input_args: List[str] = None,
+    parse_known: bool = False,
+    suppress_defaults: bool = False,
+    modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None,
+):
+    """
+    Args:
+        parser (ArgumentParser): the parser
+        input_args (List[str]): strings to parse, defaults to sys.argv
+        parse_known (bool): only parse known arguments, similar to
+            `ArgumentParser.parse_known_args`
+        suppress_defaults (bool): parse while ignoring all default values
+        modify_parser (Optional[Callable[[ArgumentParser], None]]):
+            function to modify the parser, e.g., to set default values
+    """
+    if suppress_defaults:
+        # Parse args without any default values. This requires us to parse
+        # twice, once to identify all the necessary task/model args, and a second
+        # time with all defaults set to None.
+        args = parse_args_and_arch(
+            parser,
+            input_args=input_args,
+            parse_known=parse_known,
+            suppress_defaults=False,
+        )
+        suppressed_parser = argparse.ArgumentParser(add_help=False, parents=[parser])
+        suppressed_parser.set_defaults(**{k: None for k, v in vars(args).items()})
+        args = suppressed_parser.parse_args(input_args)
+        return argparse.Namespace(
+            **{k: v for k, v in vars(args).items() if v is not None}
+        )
+
+    from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY, MODEL_REGISTRY
+
+    # Before creating the true parser, we need to import optional user module
+    # in order to eagerly import custom tasks, optimizers, architectures, etc.
+    usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
+    usr_parser.add_argument("--user-dir", default=None)
+    usr_args, _ = usr_parser.parse_known_args(input_args)
+    utils.import_user_module(usr_args)
+
+    if modify_parser is not None:
+        modify_parser(parser)
+
+    # The parser doesn't know about model/criterion/optimizer-specific args, so
+    # we parse twice. First we parse the model/criterion/optimizer, then we
+    # parse a second time after adding the *-specific arguments.
+    # If input_args is given, we will parse those args instead of sys.argv.
+    args, _ = parser.parse_known_args(input_args)
+
+    # Add model-specific args to parser.
+    if hasattr(args, "arch"):
+        model_specific_group = parser.add_argument_group(
+            "Model-specific configuration",
+            # Only include attributes which are explicitly given as command-line
+            # arguments or which have default values.
+            argument_default=argparse.SUPPRESS,
+        )
+        if args.arch in ARCH_MODEL_REGISTRY:
+            ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group)
+        elif args.arch in MODEL_REGISTRY:
+            MODEL_REGISTRY[args.arch].add_args(model_specific_group)
+        else:
+            raise RuntimeError()
+
+    if hasattr(args, "task"):
+        from fairseq.tasks import TASK_REGISTRY
+
+        TASK_REGISTRY[args.task].add_args(parser)
+    if getattr(args, "use_bmuf", False):
+        # hack to support extra args for block distributed data parallelism
+        from fairseq.optim.bmuf import FairseqBMUF
+
+        FairseqBMUF.add_args(parser)
+
+    # Add *-specific args to parser.
+    from fairseq.registry import REGISTRIES
+
+    for registry_name, REGISTRY in REGISTRIES.items():
+        choice = getattr(args, registry_name, None)
+        if choice is not None:
+            cls = REGISTRY["registry"][choice]
+            if hasattr(cls, "add_args"):
+                cls.add_args(parser)
+            elif hasattr(cls, "__dataclass"):
+                gen_parser_from_dataclass(parser, cls.__dataclass())
+
+    # Modify the parser a second time, since defaults may have been reset
+    if modify_parser is not None:
+        modify_parser(parser)
+
+    # Parse a second time.
+    if parse_known:
+        args, extra = parser.parse_known_args(input_args)
+    else:
+        args = parser.parse_args(input_args)
+        extra = None
+    # Post-process args.
+    if (
+        hasattr(args, "batch_size_valid") and args.batch_size_valid is None
+    ) or not hasattr(args, "batch_size_valid"):
+        args.batch_size_valid = args.batch_size
+    if hasattr(args, "max_tokens_valid") and args.max_tokens_valid is None:
+        args.max_tokens_valid = args.max_tokens
+    if getattr(args, "memory_efficient_fp16", False):
+        args.fp16 = True
+    if getattr(args, "memory_efficient_bf16", False):
+        args.bf16 = True
+    args.tpu = getattr(args, "tpu", False)
+    args.bf16 = getattr(args, "bf16", False)
+    if args.bf16:
+        args.tpu = True
+    if args.tpu and args.fp16:
+        raise ValueError("Cannot combine --fp16 and --tpu, use --bf16 on TPUs")
+
+    if getattr(args, "seed", None) is None:
+        args.seed = 1  # default seed for training
+        args.no_seed_provided = True
+    else:
+        args.no_seed_provided = False
+
+    if getattr(args, "update_epoch_batch_itr", None) is None:
+        if hasattr(args, "grouped_shuffling"):
+            args.update_epoch_batch_itr = args.grouped_shuffling
+        else:
+            args.grouped_shuffling = False
+            args.update_epoch_batch_itr = False
+
+    # Apply architecture configuration.
+    if hasattr(args, "arch") and args.arch in ARCH_CONFIG_REGISTRY:
+        ARCH_CONFIG_REGISTRY[args.arch](args)
+
+    if parse_known:
+        return args, extra
+    else:
+        return args
+
+
+def get_parser(desc, default_task="translation"):
+    # Before creating the true parser, we need to import optional user module
+    # in order to eagerly import custom tasks, optimizers, architectures, etc.
+    usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
+    usr_parser.add_argument("--user-dir", default=None)
+    usr_args, _ = usr_parser.parse_known_args()
+    utils.import_user_module(usr_args)
+
+    parser = argparse.ArgumentParser(allow_abbrev=False)
+    gen_parser_from_dataclass(parser, CommonConfig())
+
+    from fairseq.registry import REGISTRIES
+
+    for registry_name, REGISTRY in REGISTRIES.items():
+        parser.add_argument(
+            "--" + registry_name.replace("_", "-"),
+            default=REGISTRY["default"],
+            choices=REGISTRY["registry"].keys(),
+        )
+
+    # Task definitions can be found under fairseq/tasks/
+    from fairseq.tasks import TASK_REGISTRY
+
+    parser.add_argument(
+        "--task",
+        metavar="TASK",
+        default=default_task,
+        choices=TASK_REGISTRY.keys(),
+        help="task",
+    )
+    # fmt: on
+    return parser
+
+
+def add_preprocess_args(parser):
+    group = parser.add_argument_group("Preprocessing")
+    # fmt: off
+    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
+                       help="source language")
+    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
+                       help="target language")
+    group.add_argument("--trainpref", metavar="FP", default=None,
+                       help="train file prefix (also used to build dictionaries)")
+    group.add_argument("--validpref", metavar="FP", default=None,
+                       help="comma separated, valid file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
+    group.add_argument("--testpref", metavar="FP", default=None,
+                       help="comma separated, test file prefixes "
+                            "(words missing from train set are replaced with <unk>)")
+    group.add_argument("--align-suffix", metavar="FP", default=None,
+                       help="alignment file suffix")
+    group.add_argument("--destdir", metavar="DIR", default="data-bin",
+                       help="destination dir")
+    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
+                       help="map words appearing less than threshold times to unknown")
+    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
+                       help="map words appearing less than threshold times to unknown")
+    group.add_argument("--tgtdict", metavar="FP",
+                       help="reuse given target dictionary")
+    group.add_argument("--srcdict", metavar="FP",
+                       help="reuse given source dictionary")
+    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
+                       help="number of target words to retain")
+    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
+                       help="number of source words to retain")
+    group.add_argument("--alignfile", metavar="ALIGN", default=None,
+                       help="an alignment file (optional)")
+    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
+                        choices=get_available_dataset_impl(),
+                        help='output dataset implementation')
+    group.add_argument("--joined-dictionary", action="store_true",
+                       help="Generate joined dictionary")
+    group.add_argument("--only-source", action="store_true",
+                       help="Only process the source language")
+    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
+                       help="Pad dictionary size to be multiple of N")
+    group.add_argument("--workers", metavar="N", default=1, type=int,
+                       help="number of parallel workers")
+    group.add_argument("--dict-only", action='store_true',
+                       help="if true, only builds a dictionary and then exits")
+    # fmt: on
+    return parser
+
+
+def add_dataset_args(parser, train=False, gen=False):
+    group = parser.add_argument_group("dataset_data_loading")
+    gen_parser_from_dataclass(group, DatasetConfig())
+    # fmt: on
+    return group
+
+
+def add_distributed_training_args(parser, default_world_size=None):
+    group = parser.add_argument_group("distributed_training")
+    if default_world_size is None:
+        default_world_size = max(1, torch.cuda.device_count())
+    gen_parser_from_dataclass(
+        group, DistributedTrainingConfig(distributed_world_size=default_world_size)
+    )
+    return group
+
+
+def add_optimization_args(parser):
+    group = parser.add_argument_group("optimization")
+    # fmt: off
+    gen_parser_from_dataclass(group, OptimizationConfig())
+    # fmt: on
+    return group
+
+
+def add_checkpoint_args(parser):
+    group = parser.add_argument_group("checkpoint")
+    # fmt: off
+    gen_parser_from_dataclass(group, CheckpointConfig())
+    # fmt: on
+    return group
+
+
+def add_common_eval_args(group):
+    gen_parser_from_dataclass(group, CommonEvalConfig())
+
+
+def add_eval_lm_args(parser):
+    group = parser.add_argument_group("LM Evaluation")
+    add_common_eval_args(group)
+    gen_parser_from_dataclass(group, EvalLMConfig())
+
+
+def add_generation_args(parser):
+    group = parser.add_argument_group("Generation")
+    add_common_eval_args(group)
+    gen_parser_from_dataclass(group, GenerationConfig())
+    return group
+
+
+def add_speech_generation_args(parser):
+    group = parser.add_argument_group("Speech Generation")
+    add_common_eval_args(group)  # NOTE: remove_bpe is not needed
+    # fmt: off
+    group.add_argument('--eos_prob_threshold', default=0.5, type=float,
+                       help='terminate when eos probability exceeds this')
+    # fmt: on
+    return group
+
+
+def add_interactive_args(parser):
+    group = parser.add_argument_group("Interactive")
+    gen_parser_from_dataclass(group, InteractiveConfig())
+
+
+def add_model_args(parser):
+    group = parser.add_argument_group("Model configuration")
+    # fmt: off
+
+    # Model definitions can be found under fairseq/models/
+    #
+    # The model architecture can be specified in several ways.
+    # In increasing order of priority:
+    # 1) model defaults (lowest priority)
+    # 2) --arch argument
+    # 3) --encoder/decoder-* arguments (highest priority)
+    from fairseq.models import ARCH_MODEL_REGISTRY
+    group.add_argument('--arch', '-a', metavar='ARCH',
+                       choices=ARCH_MODEL_REGISTRY.keys(),
+                       help='model architecture')
+    # fmt: on
+    return group
+
+
+def get_args(
+    data: Union[str, Path],
+    task: str = "translation",
+    arch: str = "transformer",
+    **overrides
+):
+    parser = get_training_parser(task)
+    args = parse_args_and_arch(parser, [str(data), "--task", task, "--arch", arch])
+
+    for k, v in overrides.items():
+        setattr(args, k, v)
+
+    return args
+
+
+def add_ema_args(parser):
+    group = parser.add_argument_group("EMA configuration")
+    gen_parser_from_dataclass(group, EMAConfig())
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
new file mode 100644
index 0000000000..1ba6ef0d33
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import multiprocessing
+import os
+import pdb
+import sys
+
+
+__all__ = ["set_trace"]
+
+
+_stdin = [None]
+_stdin_lock = multiprocessing.Lock()
+try:
+    _stdin_fd = sys.stdin.fileno()
+except Exception:
+    _stdin_fd = None
+
+
+class MultiprocessingPdb(pdb.Pdb):
+    """A Pdb wrapper that works in a multiprocessing environment.
+
+    Usage: `from fairseq import pdb; pdb.set_trace()`
+    """
+
+    def __init__(self):
+        pdb.Pdb.__init__(self, nosigint=True)
+
+    def _cmdloop(self):
+        stdin_bak = sys.stdin
+        with _stdin_lock:
+            try:
+                if _stdin_fd is not None:
+                    if not _stdin[0]:
+                        _stdin[0] = os.fdopen(_stdin_fd)
+                    sys.stdin = _stdin[0]
+                self.cmdloop()
+            finally:
+                sys.stdin = stdin_bak
+
+
+def set_trace():
+    pdb = MultiprocessingPdb()
+    pdb.set_trace(sys._getframe().f_back)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
new file mode 100644
index 0000000000..11fc414c85
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
@@ -0,0 +1,143 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from fairseq.modules.quantization import pq, quantization_options, scalar
+from omegaconf import DictConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+def quantize_model_scalar(model, model_cfg: DictConfig):
+    quant_noise_scalar = getattr(model_cfg, "quant_noise_scalar", 0) or 0
+    if quant_noise_scalar > 0:
+        # quantize_model edits the model in place
+        scalar.quantize_model_(model, p=quant_noise_scalar, bits=8, update_step=1000)
+    return model
+
+
+class Quantizer(object):
+    def __init__(self, config_path, max_epoch, max_update):
+        try:
+            import yaml
+        except ImportError:
+            raise ImportError("Please install yaml with: pip install yaml")
+
+        # parse config
+        if config_path:
+            with open(config_path) as config_file:
+                config = quantization_options.parse_config_yaml(
+                    yaml.safe_load(config_file)
+                )
+        else:
+            config = quantization_options.parse_config_yaml({})
+
+        self.n_centroids_config = config["n_centroids"]
+        self.block_sizes_config = config["block_sizes"]
+        self.layers_to_quantize = config["layers_to_quantize"]
+
+        # We assume that training will run for a fixed number of epochs
+        # (or updates) and that we should train for equal durations
+        # between iterations of PQ.
+        num_iterations = len(self.layers_to_quantize)
+        if max_epoch > 0:
+            assert max_epoch % num_iterations == 0, (
+                "for iterative PQ, --max-epoch (={}) must be evenly divisible by "
+                "len(layers_to_quantize) (={})".format(max_epoch, num_iterations)
+            )
+            self.epoch_schedule = max_epoch // num_iterations
+        else:
+            self.epoch_schedule = None
+        if max_update > 0:
+            assert max_update % num_iterations == 0, (
+                "for iterative PQ, --max-update (={}) must be evenly divisible by "
+                "len(layers_to_quantize) (={})".format(max_update, num_iterations)
+            )
+            self.update_schedule = max_update // num_iterations
+        else:
+            self.update_schedule = None
+        assert (self.epoch_schedule is not None) ^ (
+            self.update_schedule is not None
+        ), "for iterative PQ, cannot specify both --max-update and --max-epoch"
+
+        # 0 is a special value for quantization step, which will force
+        # the first call to begin_epoch() to call step()
+        self.quantization_step = 0
+
+    def set_trainer(self, trainer):
+        self.trainer = trainer
+        self.size_tracker = pq.SizeTracker(self.trainer.get_model())
+
+    def step(self):
+        """Move to the next stage of quantization."""
+        if self.quantization_step >= len(self.layers_to_quantize):
+            # Maybe we just finished the last training step or we loaded
+            # a checkpoint for an iterative PQ model which previously
+            # finished training. Either way, don't quantize again.
+            return
+
+        logger.info(
+            "quantizing model (step={}; layers_to_quantize[step]={})".format(
+                self.quantization_step, self.layers_to_quantize[self.quantization_step]
+            )
+        )
+        quantized_layers = pq.quantize_model_(
+            self.trainer.get_model(),
+            self.size_tracker,
+            self.layers_to_quantize,
+            self.block_sizes_config,
+            self.n_centroids_config,
+            step=self.quantization_step,
+        )
+        logger.info("quantized layers: {}".format(quantized_layers))
+        logger.info(self.size_tracker)
+
+        self.quantization_step += 1
+
+        # reintialize the Trainer since model parameters have changed
+        self.trainer.reinitialize()
+
+    def begin_epoch(self, epoch):
+        """Called at the beginning of each epoch (epochs start at 1)."""
+        if (
+            (
+                self.epoch_schedule is not None
+                and epoch > 0
+                and (epoch - 1) % self.epoch_schedule == 0
+            )
+            # we always step once in the beginning, even if using
+            # update-based quantization
+            or self.quantization_step == 0
+        ):
+            self.step()
+
+    def step_update(self, num_updates):
+        """Called at the end of each step."""
+        if (
+            self.update_schedule is not None
+            and num_updates > 0
+            and num_updates % self.update_schedule == 0
+        ):
+            self.step()
+
+    def state_dict(self):
+        return {
+            "n_centroids_config": self.n_centroids_config,
+            "block_sizes_config": self.block_sizes_config,
+            "layers_to_quantize": self.layers_to_quantize,
+            "epoch_schedule": self.epoch_schedule,
+            "update_schedule": self.update_schedule,
+            "quantization_step": self.quantization_step,
+        }
+
+    def load_state_dict(self, state_dict):
+        self.n_centroids_config = state_dict["n_centroids_config"]
+        self.block_sizes_config = state_dict["block_sizes_config"]
+        self.layers_to_quantize = state_dict["layers_to_quantize"]
+        self.epoch_schedule = state_dict["epoch_schedule"]
+        self.update_schedule = state_dict["update_schedule"]
+        self.quantization_step = state_dict["quantization_step"]
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
new file mode 100644
index 0000000000..904ffcd602
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from argparse import Namespace
+
+from typing import Union
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import merge_with_parent
+from hydra.core.config_store import ConfigStore
+from omegaconf import DictConfig
+
+REGISTRIES = {}  # registry name -> {"registry", "default", "dataclass_registry"} dict
+
+
+def setup_registry(registry_name: str, base_class=None, default=None, required=False):
+    """Create a named registry and return its helpers.
+
+    ``registry_name`` must start with "--" (dropped; dashes become underscores).
+    Returns ``(build_x, register_x, REGISTRY, DATACLASS_REGISTRY)``, or ``None``
+    when that name is already present in ``REGISTRIES``.
+    """
+    assert registry_name.startswith("--")
+    registry_name = registry_name[2:].replace("-", "_")
+
+    # maintain a registry of all registries
+    if registry_name in REGISTRIES:
+        return  # registry already exists
+
+    REGISTRY = {}
+    REGISTRY_CLASS_NAMES = set()  # nothing below adds to it, so its check never fires
+    DATACLASS_REGISTRY = {}
+    REGISTRIES[registry_name] = {
+        "registry": REGISTRY,
+        "default": default,
+        "dataclass_registry": DATACLASS_REGISTRY,
+    }
+
+    def build_x(cfg: Union[DictConfig, str, Namespace], *extra_args, **extra_kwargs):
+        # Resolve the chosen name; swap cfg for its registered dataclass form, if any.
+        if isinstance(cfg, DictConfig):
+            choice = cfg._name
+            if choice and choice in DATACLASS_REGISTRY:
+                drop_missing = extra_kwargs.get("from_checkpoint", False)
+                parent = DATACLASS_REGISTRY[choice]()
+                cfg = merge_with_parent(parent, cfg, remove_missing=drop_missing)
+        elif isinstance(cfg, str):
+            choice = cfg
+            if choice in DATACLASS_REGISTRY:
+                cfg = DATACLASS_REGISTRY[choice]()
+        else:
+            choice = getattr(cfg, registry_name, None)
+            if choice in DATACLASS_REGISTRY:
+                cfg = DATACLASS_REGISTRY[choice].from_namespace(cfg)
+
+        if choice is None:
+            if required:
+                raise ValueError("{} is required!".format(registry_name))
+            return None
+
+        cls = REGISTRY[choice]
+        # use the class's ``build_<registry_name>`` attribute when it has one
+        builder = getattr(cls, "build_" + registry_name, cls)
+        # "from_checkpoint" is consumed here, never forwarded to the builder
+        extra_kwargs.pop("from_checkpoint", None)
+        return builder(cfg, *extra_args, **extra_kwargs)
+
+    def register_x(name, dataclass=None):
+        def register_x_cls(cls):
+            if name in REGISTRY:
+                raise ValueError(
+                    "Cannot register duplicate {} ({})".format(registry_name, name)
+                )
+            if cls.__name__ in REGISTRY_CLASS_NAMES:
+                raise ValueError(
+                    "Cannot register {} with duplicate class name ({})".format(
+                        registry_name, cls.__name__
+                    )
+                )
+            if base_class is not None and not issubclass(cls, base_class):
+                raise ValueError(
+                    "{} must extend {}".format(cls.__name__, base_class.__name__)
+                )
+            if dataclass is not None and not issubclass(dataclass, FairseqDataclass):
+                raise ValueError(
+                    "Dataclass {} must extend FairseqDataclass".format(dataclass)
+                )
+
+            cls.__dataclass = dataclass
+            if dataclass is not None:
+                DATACLASS_REGISTRY[name] = dataclass
+                # also publish a default-constructed node in the Hydra ConfigStore
+                cs = ConfigStore.instance()
+                node = dataclass()
+                node._name = name
+                cs.store(name=name, group=registry_name, node=node, provider="fairseq")
+
+            REGISTRY[name] = cls
+            return cls
+
+        return register_x_cls
+
+    return build_x, register_x, REGISTRY, DATACLASS_REGISTRY
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
new file mode 100644
index 0000000000..58f2f563e4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
@@ -0,0 +1,55 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import importlib
+import os
+from abc import ABC, abstractmethod
+
+from fairseq import registry
+from omegaconf import DictConfig
+
+
+class BaseScorer(ABC):
+    """Base scorer that collects (reference, prediction) pairs in two lists."""
+
+    def __init__(self, cfg):
+        self.cfg, self.ref, self.pred = cfg, [], []
+
+    def add_string(self, ref, pred):
+        self.ref.append(ref)
+        self.pred.append(pred)
+
+    @abstractmethod
+    def score(self) -> float:
+        """Return the metric value (implemented by subclasses)."""
+
+    @abstractmethod
+    def result_string(self) -> str:
+        """Return a printable summary (implemented by subclasses)."""
+
+
+_build_scorer, register_scorer, SCORER_REGISTRY, _ = registry.setup_registry(
+    "--scoring", default="bleu"  # registered under the name "scoring"
+)
+
+
+def build_scorer(choice, tgt_dict):
+    """Build the chosen scorer; "bleu" takes its pad/eos/unk ids from ``tgt_dict``."""
+    name = choice._name if isinstance(choice, DictConfig) else choice
+    if name != "bleu":
+        return _build_scorer(choice)
+
+    from fairseq.scoring import bleu
+
+    ids = bleu.BleuConfig(pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk())
+    return bleu.Scorer(ids)
+
+
+# import every non-underscore *.py file here so its @register_scorer calls run
+for file in sorted(os.listdir(os.path.dirname(__file__))):
+    if file.endswith(".py") and not file.startswith("_"):
+        module = file[: file.find(".py")]  # name up to the first ".py"
+        importlib.import_module("fairseq.scoring." + module)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
new file mode 100644
index 0000000000..6d5a8450d3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+import numpy as np
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.scoring import BaseScorer, register_scorer
+
+
+@dataclass
+class BertScoreScorerConfig(FairseqDataclass):  # read by BertScoreScorer.score as lang=
+    bert_score_lang: str = field(default="en", metadata={"help": "BERTScore language"})
+
+
+@register_scorer("bert_score", dataclass=BertScoreScorerConfig)
+class BertScoreScorer(BaseScorer):
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        try:
+            import bert_score as _bert_score
+        except ImportError:
+            raise ImportError("Please install BERTScore: pip install bert-score")
+        self.cfg = cfg
+        self._bert_score = _bert_score
+        self.scores = None  # per-pair values, filled in by score()
+
+    def add_string(self, ref, pred):
+        self.ref.append(ref)
+        self.pred.append(pred)
+
+    def score(self, order=4):
+        """Return the mean of the third tensor from ``bert_score.score``."""
+        _, _, per_pair = self._bert_score.score(
+            self.pred, self.ref, lang=self.cfg.bert_score_lang
+        )
+        self.scores = per_pair.numpy()
+        return np.mean(self.scores)
+
+    def result_string(self, order=4):
+        return f"BERTScore: {self.score():.4f}"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
new file mode 100644
index 0000000000..e55bd2f393
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import ctypes
+import math
+import sys
+from dataclasses import dataclass, field
+
+import torch
+from fairseq.dataclass import FairseqDataclass
+from fairseq.scoring import BaseScorer, register_scorer
+from fairseq.scoring.tokenizer import EvaluationTokenizer
+
+
+class BleuStat(ctypes.Structure):
+    """ctypes structure passed by reference to the libbleu routines.
+
+    Field order: reflen, predlen, then match/count pairs numbered 1 through
+    4; every field is a ``size_t``. The order is kept exactly as declared
+    because ctypes lays the fields out in that sequence.
+    """
+
+    _fields_ = [("reflen", ctypes.c_size_t), ("predlen", ctypes.c_size_t)] + [
+        (f"{kind}{n}", ctypes.c_size_t)
+        for n in range(1, 5)
+        for kind in ("match", "count")
+    ]
+
+
+@dataclass
+class SacrebleuConfig(FairseqDataclass):  # SacrebleuScorer passes these to EvaluationTokenizer
+    sacrebleu_tokenizer: EvaluationTokenizer.ALL_TOKENIZER_TYPES = field(
+        default="13a", metadata={"help": "tokenizer"}
+    )
+    sacrebleu_lowercase: bool = field(
+        default=False, metadata={"help": "apply lowercasing"}
+    )
+    sacrebleu_char_level: bool = field(
+        default=False, metadata={"help": "evaluate at character level"}
+    )
+
+
+@register_scorer("sacrebleu", dataclass=SacrebleuConfig)
+class SacrebleuScorer(BaseScorer):
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        import sacrebleu
+        self.sacrebleu = sacrebleu
+        self.tokenizer = EvaluationTokenizer(
+            tokenizer_type=cfg.sacrebleu_tokenizer,
+            lowercase=cfg.sacrebleu_lowercase,
+            character_tokenization=cfg.sacrebleu_char_level,
+        )
+
+    def add_string(self, ref, pred):
+        """Store the tokenized reference and prediction."""
+        self.ref.append(self.tokenizer.tokenize(ref))
+        self.pred.append(self.tokenizer.tokenize(pred))
+
+    def _score(self, order=4):
+        """Corpus BLEU on the pre-tokenized strings (sacrebleu tokenization is off)."""
+        if order != 4:
+            raise NotImplementedError
+        return self.sacrebleu.corpus_bleu(self.pred, [self.ref], tokenize="none")
+
+    def score(self, order=4):
+        return self._score(order).score
+
+    def result_string(self, order=4):
+        return self._score(order).format()
+
+
+@dataclass
+class BleuConfig(FairseqDataclass):  # special-symbol indices read by Scorer.__init__
+    pad: int = field(default=1, metadata={"help": "padding index"})
+    eos: int = field(default=2, metadata={"help": "eos index"})
+    unk: int = field(default=3, metadata={"help": "unk index"})
+
+
+@register_scorer("bleu", dataclass=BleuConfig)
+class Scorer(object):
+    """BLEU over integer token tensors; counts are accumulated through libbleu."""
+
+    def __init__(self, cfg):
+        self.stat = BleuStat()
+        self.pad = cfg.pad
+        self.eos = cfg.eos
+        self.unk = cfg.unk
+
+        try:
+            from fairseq import libbleu
+        except ImportError as e:
+            sys.stderr.write(
+                "ERROR: missing libbleu.so. run `pip install --editable .`\n"
+            )
+            raise e
+        self.C = ctypes.cdll.LoadLibrary(libbleu.__file__)
+        self.reset()
+
+    def reset(self, one_init=False):
+        """Re-initialise ``self.stat`` with ``bleu_one_init`` or ``bleu_zero_init``."""
+        init = self.C.bleu_one_init if one_init else self.C.bleu_zero_init
+        init(ctypes.byref(self.stat))
+
+    def add(self, ref, pred):
+        """Accumulate one pair; both arguments must be ``torch.IntTensor``."""
+        if not isinstance(ref, torch.IntTensor):
+            raise TypeError("ref must be a torch.IntTensor (got {})".format(type(ref)))
+        if not isinstance(pred, torch.IntTensor):
+            raise TypeError("pred must be a torch.IntTensor(got {})".format(type(pred)))
+
+        # don't match unknown words
+        rref = ref.clone()
+        assert not rref.lt(0).any()
+        rref[rref.eq(self.unk)] = -999
+        rref = rref.contiguous().view(-1)
+        pred = pred.contiguous().view(-1)
+
+        self.C.bleu_add(
+            ctypes.byref(self.stat),
+            ctypes.c_size_t(rref.size(0)),
+            ctypes.c_void_p(rref.data_ptr()),
+            ctypes.c_size_t(pred.size(0)),
+            ctypes.c_void_p(pred.data_ptr()),
+            ctypes.c_int(self.pad),
+            ctypes.c_int(self.eos),
+        )
+
+    def score(self, order=4):
+        """Return 100 * brevity * geometric mean of the first ``order`` precisions."""
+        psum = sum(
+            math.log(p) if p > 0 else float("-Inf") for p in self.precision()[:order]
+        )
+        return self.brevity() * math.exp(psum / order) * 100
+
+    def precision(self):
+        """Return the four match/count ratios (0 where a count is 0)."""
+        s = self.stat
+        mc = [(getattr(s, f"match{n}"), getattr(s, f"count{n}")) for n in range(1, 5)]
+        return [m / c if c > 0 else 0 for m, c in mc]
+
+    def brevity(self):
+        """Return min(1, exp(1 - reflen / predlen)); 0 when predlen is 0."""
+        if self.stat.predlen == 0:
+            return 0  # limit of the formula; avoids dividing by zero
+        r = self.stat.reflen / self.stat.predlen
+        return min(1, math.exp(1 - r))
+
+    def result_string(self, order=4):
+        """Format the score, precisions, brevity penalty and length statistics."""
+        assert order <= 4, "BLEU scores for order > 4 aren't supported"
+        fmt = "BLEU{} = {:2.2f}, {:2.1f}"
+        for _ in range(1, order):
+            fmt += "/{:2.1f}"
+        fmt += " (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})"
+        bleup = [p * 100 for p in self.precision()[:order]]
+        ratio = self.stat.predlen / self.stat.reflen if self.stat.reflen > 0 else 0
+        return fmt.format(
+            order,
+            self.score(order=order),
+            *bleup,
+            self.brevity(),
+            ratio,
+            self.stat.predlen,
+            self.stat.reflen
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
new file mode 100644
index 0000000000..5df5a1c011
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from dataclasses import dataclass
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.scoring import BaseScorer, register_scorer
+
+
+@dataclass
+class ChrFScorerConfig(FairseqDataclass):
+    pass  # declares no fields of its own
+
+
+@register_scorer("chrf", dataclass=ChrFScorerConfig)
+class ChrFScorer(BaseScorer):
+    """chrF via ``sacrebleu.corpus_chrf``; ``add_string`` comes from BaseScorer."""
+
+    def __init__(self, args):
+        super(ChrFScorer, self).__init__(args)
+        import sacrebleu
+        self.sacrebleu = sacrebleu
+
+    def _score(self, order=4):
+        if order != 4:
+            raise NotImplementedError
+        return self.sacrebleu.corpus_chrf(self.pred, [self.ref])
+
+    def score(self, order=4):
+        return self._score(order).score  # numeric value, not the formatted string
+
+    def result_string(self, order=4):
+        return self._score(order).format()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
new file mode 100644
index 0000000000..32719956fe
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+from dataclasses import dataclass
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.scoring import BaseScorer, register_scorer
+
+
+@dataclass
+class MeteorScorerConfig(FairseqDataclass):
+    pass  # declares no fields of its own
+
+
+@register_scorer("meteor", dataclass=MeteorScorerConfig)
+class MeteorScorer(BaseScorer):
+    """Averages nltk's ``single_meteor_score`` over the stored (ref, pred) pairs."""
+
+    def __init__(self, args):
+        super().__init__(args)
+        try:
+            import nltk
+        except ImportError:
+            raise ImportError("Please install nltk to use METEOR scorer")
+        self.nltk = nltk
+        self.scores = []
+
+    def add_string(self, ref, pred):
+        self.ref.append(ref)
+        self.pred.append(pred)
+
+    def score(self, order=4):
+        """Recompute the per-pair scores and return their mean; ``order`` is unused."""
+        meteor = self.nltk.translate.meteor_score.single_meteor_score
+        self.scores = [meteor(r, p) for r, p in zip(self.ref, self.pred)]
+        return np.mean(self.scores)
+
+    def result_string(self, order=4):
+        return f"METEOR: {self.score():.4f}"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
new file mode 100644
index 0000000000..b0cedd5099
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unicodedata
+
+import sacrebleu as sb
+
+from fairseq.dataclass import ChoiceEnum
+
+SACREBLEU_V2_ABOVE = int(sb.__version__.split(".")[0]) >= 2  # whole major number
+
+
+class EvaluationTokenizer(object):
+    """A generic evaluation-time tokenizer, which leverages built-in tokenizers
+    in sacreBLEU (https://github.com/mjpost/sacrebleu). It additionally provides
+    lowercasing, punctuation removal and character tokenization, which are
+    applied after sacreBLEU tokenization.
+
+    Args:
+        tokenizer_type (str): the type of sacreBLEU tokenizer to apply.
+        lowercase (bool): lowercase the text.
+        punctuation_removal (bool): remove punctuation (based on unicode
+            category) from text.
+        character_tokenization (bool): tokenize the text to characters.
+    """
+
+    SPACE = chr(32)
+    SPACE_ESCAPE = chr(9601)  # U+2581; stands in for spaces in character mode
+    _ALL_TOKENIZER_TYPES = (
+        sb.BLEU.TOKENIZERS
+        if SACREBLEU_V2_ABOVE
+        else ["none", "13a", "intl", "zh", "ja-mecab"]
+    )
+    ALL_TOKENIZER_TYPES = ChoiceEnum(_ALL_TOKENIZER_TYPES)
+
+    def __init__(
+        self,
+        tokenizer_type: str = "13a",
+        lowercase: bool = False,
+        punctuation_removal: bool = False,
+        character_tokenization: bool = False,
+    ):
+
+        assert (
+            tokenizer_type in self._ALL_TOKENIZER_TYPES
+        ), f"{tokenizer_type}, {self._ALL_TOKENIZER_TYPES}"
+        self.lowercase = lowercase
+        self.punctuation_removal = punctuation_removal
+        self.character_tokenization = character_tokenization
+        if SACREBLEU_V2_ABOVE:
+            self.tokenizer = sb.BLEU(tokenize=str(tokenizer_type)).tokenizer
+        else:
+            self.tokenizer = sb.tokenizers.TOKENIZERS[tokenizer_type]()
+
+    @classmethod
+    def remove_punctuation(cls, sent: str) -> str:
+        """Drop space-separated tokens made up only of Unicode "P*" characters."""
+        return cls.SPACE.join(
+            t
+            for t in sent.split(cls.SPACE)
+            if not all(unicodedata.category(c)[0] == "P" for c in t)
+        )
+
+    def tokenize(self, sent: str):
+        tokenized = self.tokenizer(sent)  # sacreBLEU tokenizer runs first
+
+        if self.punctuation_removal:
+            tokenized = self.remove_punctuation(tokenized)
+
+        if self.character_tokenization:
+            tokenized = self.SPACE.join(  # escape spaces, then space out every character
+                list(tokenized.replace(self.SPACE, self.SPACE_ESCAPE))
+            )
+
+        if self.lowercase:
+            tokenized = tokenized.lower()
+
+        return tokenized
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
new file mode 100644
index 0000000000..633dc47c24
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.scoring import BaseScorer, register_scorer
+from fairseq.scoring.tokenizer import EvaluationTokenizer
+
+
+@dataclass
+class WerScorerConfig(FairseqDataclass):  # WerScorer passes these to EvaluationTokenizer
+    wer_tokenizer: EvaluationTokenizer.ALL_TOKENIZER_TYPES = field(
+        default="none", metadata={"help": "sacreBLEU tokenizer to use for evaluation"}
+    )
+    wer_remove_punct: bool = field(
+        default=False, metadata={"help": "remove punctuation"}
+    )
+    wer_char_level: bool = field(
+        default=False, metadata={"help": "evaluate at character level"}
+    )
+    wer_lowercase: bool = field(default=False, metadata={"help": "lowercasing"})
+
+
+@register_scorer("wer", dataclass=WerScorerConfig)
+class WerScorer(BaseScorer):
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        self.reset()
+        try:
+            import editdistance as ed
+        except ImportError:
+            raise ImportError("Please install editdistance to use WER scorer")
+        self.ed = ed
+        self.tokenizer = EvaluationTokenizer(
+            tokenizer_type=self.cfg.wer_tokenizer,
+            lowercase=self.cfg.wer_lowercase,
+            punctuation_removal=self.cfg.wer_remove_punct,
+            character_tokenization=self.cfg.wer_char_level,
+        )
+
+    def reset(self):
+        self.distance = self.ref_length = 0  # summed edit distance / reference tokens
+
+    def add_string(self, ref, pred):
+        """Add one pair's token-level edit distance and reference length."""
+        ref_toks, hyp_toks = (self.tokenizer.tokenize(s).split() for s in (ref, pred))
+        self.distance += self.ed.eval(ref_toks, hyp_toks)
+        self.ref_length += len(ref_toks)
+
+    def result_string(self):
+        return f"WER: {self.score():.2f}"
+
+    def score(self):
+        """Return 100 * distance / ref_length, or 0 when ref_length is 0."""
+        return 100.0 * self.distance / self.ref_length if self.ref_length > 0 else 0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
new file mode 100644
index 0000000000..c7378bbb51
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
@@ -0,0 +1,892 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+from fairseq.token_generation_constraints import (
+    ConstraintState,
+    OrderedConstraintState,
+    UnorderedConstraintState,
+)
+from torch import Tensor
+
+
+class Search(nn.Module):
+    def __init__(self, tgt_dict):
+        super().__init__()
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos()
+        self.vocab_size = len(tgt_dict)
+        self.src_lengths = torch.tensor(-1)  # placeholder until set_src_lengths()
+        self.supports_constraints = False  # LexicallyConstrainedBeamSearch sets True
+        self.stop_on_max_len = False  # PrefixConstrainedBeamSearch sets True
+
+    def step(
+        self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None
+    ):
+        """Take a single search step.
+
+        Args:
+            step: the current search step, starting at 0
+            lprobs: (bsz x input_beam_size x vocab_size)
+                the model's log-probabilities over the vocabulary at the current step
+            scores: (bsz x input_beam_size x step)
+                the historical model scores of each hypothesis up to this point
+            prev_output_tokens: (bsz x step)
+                the previously generated output tokens
+            original_batch_idxs: (bsz)
+                the tensor with the batch indices, in the range [0, bsz)
+                this is useful in case a re-ordering has been applied
+                and we need to know the original indices
+
+        Return: A tuple of (scores, indices, beams) where:
+            scores: (bsz x output_beam_size)
+                the scores of the chosen elements; output_beam_size can be
+                larger than input_beam_size, e.g., we may return
+                2*input_beam_size to account for EOS
+            indices: (bsz x output_beam_size)
+                the indices of the chosen elements
+            beams: (bsz x output_beam_size)
+                the hypothesis ids of the chosen elements, in the range [0, input_beam_size)
+        """
+        raise NotImplementedError
+
+    @torch.jit.export
+    def set_src_lengths(self, src_lengths):
+        self.src_lengths = src_lengths
+
+    @torch.jit.export
+    def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
+        """Initialize constraint states for constrained decoding (if supported).
+
+        Args:
+            batch_constraints: (torch.Tensor, optional)
+                the list of constraints, in packed form
+            beam_size: (int)
+                the beam size
+        Returns:
+            None; this base implementation does nothing.
+        """
+        pass
+
+    def prune_sentences(self, batch_idxs: Tensor):
+        """
+        Removes constraint states for completed sentences (if supported).
+        This is called from sequence_generator._generate() when sentences are
+        deleted from the batch.
+
+        Args:
+            batch_idxs: Indices of *sentences* whose constraint state should be *kept*.
+        """
+        pass
+
+    def update_constraints(self, active_hypos: Tensor):
+        """
+        Updates the constraint states by selecting the beam items that are retained.
+        This is called at each time step of sequence_generator._generate() when
+        the set of 2 * {beam_size} candidate hypotheses are reduced to the beam size.
+
+        Args:
+            active_hypos: (batch size, beam size)
+              list of integers denoting, for each sentence, which beam candidate items
+              should be kept.
+        """
+        pass
+
+
+class BeamSearch(Search):
+    def __init__(self, tgt_dict):
+        super().__init__(tgt_dict)
+        self.constraint_states = None  # not read by step() in this class
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores: Optional[Tensor],
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+        candidate_multiple: int = 2,
+    ):
+        bsz, beam_size, vocab_size = lprobs.size()
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
+
+        top_prediction = torch.topk(
+            lprobs.view(bsz, -1),
+            k=min(
+                # Take the best `candidate_multiple` (default 2) x beam_size predictions. We'll choose the first
+                # beam_size of these which don't predict eos to continue with.
+                candidate_multiple * beam_size,
+                lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
+            ),
+        )
+        scores_buf = top_prediction[0]
+        indices_buf = top_prediction[1]
+        # Project back into relative indices and beams
+        beams_buf = torch.div(indices_buf, vocab_size, rounding_mode="trunc")
+        indices_buf = indices_buf.fmod(vocab_size)
+
+        # beams_buf holds hypothesis ids and indices_buf vocab ids, both shaped (bsz x k)
+        return scores_buf, indices_buf, beams_buf
+
+
+class PrefixConstrainedBeamSearch(Search):
+    def __init__(self, tgt_dict, prefix_allowed_tokens_fn):
+        super().__init__(tgt_dict)
+        self.prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
+        self.stop_on_max_len = True
+
+    @torch.jit.export
+    def apply_mask(self, x, prev_output_tokens, original_batch_idxs):
+        """Return a mask shaped like ``x``: 0 at allowed token ids, -inf elsewhere."""
+        beam_size = x.shape[0] // original_batch_idxs.shape[0]
+        original_batch_idxs = (
+            original_batch_idxs.unsqueeze(-1).repeat((1, beam_size)).flatten().tolist()
+        )
+        # prefix_allowed_tokens_fn(batch_idx, prefix) gives the ids to unmask per row
+        mask = torch.full_like(x, -math.inf)
+        for sent_i, (sent, batch_i) in enumerate(
+            zip(prev_output_tokens, original_batch_idxs)
+        ):
+            mask[sent_i, :, self.prefix_allowed_tokens_fn(batch_i, sent)] = 0
+        return mask
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs: Tensor,
+        scores: Tensor,
+        prev_output_tokens: Tensor,
+        original_batch_idxs: Tensor,
+    ):
+        """Add the prefix mask to ``lprobs`` in place, then pick the top candidates."""
+        bsz, beam_size, vocab_size = lprobs.size()
+        lprobs += self.apply_mask(
+            lprobs.view(bsz * beam_size, 1, vocab_size),
+            prev_output_tokens,
+            original_batch_idxs,
+        ).view(bsz, beam_size, vocab_size)
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
+
+        top_prediction = torch.topk(
+            lprobs.view(bsz, -1),
+            k=min(
+                # Take the best beam_size predictions. We'll choose the first
+                # beam_size of these which don't predict eos to continue with.
+                beam_size,
+                lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
+            ),
+        )
+        scores_buf = top_prediction[0]
+        indices_buf = top_prediction[1]
+        beams_buf = torch.div(indices_buf, vocab_size, rounding_mode="trunc")
+        indices_buf = indices_buf.fmod(vocab_size)
+        return scores_buf, indices_buf, beams_buf
+
+
+class LexicallyConstrainedBeamSearch(Search):
+    """Implements lexically constrained beam search as described in
+
+        Fast Lexically Constrained Decoding with Dynamic Beam
+        Allocation for Neural Machine Translation.  Post & Vilar,
+        NAACL 2018.  https://www.aclweb.org/anthology/N18-1119/
+
+    and
+
+        Improved Lexically Constrained Decoding for Translation and
+        Monolingual Rewriting. Hu et al, NAACL
+        2019. https://www.aclweb.org/anthology/N19-1090/
+
+    This is accomplished by maintaining, for each beam hypothesis, a
+    ConstraintState object (see constraints.py) that tracks which
+    constraints have been generated and using this information to
+    shape the beam for each input sentence.
+    """
+
+    def __init__(self, tgt_dict, representation):
+        super().__init__(tgt_dict)
+        self.representation = representation
+        self.vocab_size = len(tgt_dict)
+        self.num_cands = 0
+        self.supports_constraints = True
+
+    @torch.jit.export
+    def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
+        self.constraint_states = []
+        for constraint_tensor in batch_constraints:
+            if self.representation == "ordered":
+                constraint_state = OrderedConstraintState.create(constraint_tensor)
+            elif self.representation == "unordered":
+                constraint_state = UnorderedConstraintState.create(constraint_tensor)
+
+            self.constraint_states.append([constraint_state for i in range(beam_size)])
+
+    @torch.jit.export
+    def prune_sentences(self, batch_idxs: Tensor):
+        self.constraint_states = [
+            self.constraint_states[i] for i in batch_idxs.tolist()
+        ]
+
+    @torch.jit.export
+    def update_constraints(self, active_hypos: Tensor):
+        if self.constraint_states:
+            batch_size = active_hypos.size(0)
+            for sentid in range(batch_size):
+                self.constraint_states[sentid] = [
+                    self.constraint_states[sentid][i] for i in active_hypos[sentid]
+                ]
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs: Tensor,
+        scores: Optional[Tensor],
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        """
+        A constrained step builds a large candidates list from the following:
+        - the top 2 * {beam_size} items over the whole beam
+        - for each item in the beam
+          - the top {each_k} (default 1)
+          - all next constraints
+        We then compute the constrained state of each beam item, and assign
+        stripe codes: 0 to the best in each bank, 1 to the 2nd-best, and so
+        on. We then sort by (stripe, score), and truncate the list at
+        2 * beam size.
+
+        Args:
+            step: the decoder step
+            lprobs: (batch size, beam size, target vocab)
+                the target-vocab distributions for each item in the beam.
+        Return: A tuple of (scores, indices, beams) where:
+            scores: (batch, output beam size)
+                the scores of the chosen elements
+            indices: (batch, output beam size)
+                the target vocab indices of the chosen elements
+            beams: (batch, output beam size)
+                the 0-indexed hypothesis ids of the chosen elements
+            The new constraint states are not returned; they are written
+            back to self.constraint_states, one list per sentence.
+        """
+        each_k = 1  # number of best tokens taken from each hypothesis in STEP 1
+        device = lprobs.device
+
+        batch_size, beam_size, vocab_size = lprobs.size()
+
+        self.num_cands = min(
+            # Just take the k-best. We'll get another k from the 1-best from each
+            # row, plus more from the constraints
+            beam_size * 2,
+            lprobs.view(batch_size, -1).size(1) - 1,  # -1 so we never select pad
+        )
+
+        # STEP 0: Preliminary. Prevent EOS for unfinished hyps across all batch items
+        constraint_states = self.constraint_states
+        if constraint_states and step > 0:
+            not_finished_indices = []
+            for sentno, sent_constraints in enumerate(constraint_states):
+                for beamno, state in enumerate(sent_constraints):
+                    index = sentno * beam_size + beamno
+                    if not state.finished:
+                        not_finished_indices.append(index)
+            not_finished_indices = torch.tensor(not_finished_indices)
+            if not_finished_indices.numel() > 0:
+                lprobs.view(batch_size * beam_size, -1)[
+                    not_finished_indices, self.eos
+                ] = -math.inf
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam entry for each batch item
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+        else:
+            # make probs contain cumulative scores for each hypothesis
+            assert scores is not None
+            lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)
+
+        top_prediction = torch.topk(
+            lprobs.view(batch_size, -1),
+            self.num_cands,
+        )
+        scores_buf, indices_buf = top_prediction
+        # Project back into relative indices and beams
+        beams_buf = indices_buf // vocab_size
+        indices_buf = indices_buf.fmod(vocab_size)
+
+        # Short circuit if there are no constraints in this batch
+        if not constraint_states:
+            return scores_buf, indices_buf, beams_buf
+
+        # STEP 1: get top-1 from each hypothesis across all sentences in the batch
+        if step > 0:
+            top_scores, top_indices = torch.topk(
+                lprobs.view(batch_size * beam_size, -1),
+                k=each_k,
+                dim=1,
+            )
+            top_scores = top_scores.view(batch_size, -1)
+            top_indices = top_indices.view(batch_size, -1)
+            scores_buf = torch.cat((scores_buf, top_scores), dim=1)
+            indices_buf = torch.cat((indices_buf, top_indices), dim=1)
+            new_beams = torch.arange(0, beam_size, device=device).repeat(batch_size, 1)
+            beams_buf = torch.cat((beams_buf, new_beams), dim=1)
+
+        # Now, process sentences in the batch one by one.
+        new_scores_buf = torch.zeros((batch_size, 2 * beam_size), device=device)
+        new_indices_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
+        new_beams_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
+        for sentno, states in enumerate(constraint_states):
+            scores, indices, beams, new_states = self.step_sentence(
+                step,
+                sentno,
+                lprobs[sentno],
+                constraint_states[sentno],
+                beams_buf[sentno].clone(),
+                indices_buf[sentno].clone(),
+                scores_buf[sentno].clone(),
+            )
+            new_scores_buf[sentno] = scores
+            new_indices_buf[sentno] = indices
+            new_beams_buf[sentno] = beams
+            self.constraint_states[sentno] = new_states
+
+        return new_scores_buf, new_indices_buf, new_beams_buf
+
+    @torch.jit.export
+    def step_sentence(
+        self,
+        step: int,
+        sentno: int,
+        lprobs: Tensor,
+        constraint_states: List[List[ConstraintState]],
+        beams_buf: Tensor,
+        indices_buf: Tensor,
+        scores_buf: Tensor,
+    ):
+        """Per-sentence processing: adds every hypothesis' next constraint
+        tokens to the candidates, then sorts, removes duplicates and stripes
+        across the banks. All inputs pertain to one sentence, so
+        `constraint_states` is indexed by beam (step() passes one sentence's list).
+        """
+        device = lprobs.device
+
+        # STEP 2: Add all constraints for each beam item
+        for beamno, state in enumerate(constraint_states):
+            next_tokens = torch.tensor(list(state.next_tokens()), device=device).long()
+            if next_tokens.numel() != 0:
+                indices_buf = torch.cat((indices_buf, next_tokens))
+                next_beams = (
+                    torch.tensor(beamno, device=device)
+                    .repeat(next_tokens.size(0))
+                    .long()
+                )
+                beams_buf = torch.cat((beams_buf, next_beams))
+                next_values = lprobs[beamno].take(next_tokens.view(-1))
+                scores_buf = torch.cat((scores_buf, next_values))
+
+            # At the 0th time step, there is just one beam item
+            if step == 0:
+                break
+
+        # STEP 3: Compute the "bank" for each candidate. This is the
+        # number of constraints it's generated. We need this so that
+        # we can do round-robin allocation of the beam across these
+        # banks. If C is the number of constraints, we select the best
+        # item in bank C, then the best in bank C-1, etc, followed by
+        # the 2nd-best in bank C, the 2nd-best in bank C-1, etc, and so
+        # on, until the maximum beam size. We accomplish this by
+        # creating a sort key and striping across the banks.
+
+        # Compute the new states for all candidates
+        cands_size = indices_buf.size(0)
+        constraint_states = [
+            constraint_states[beams_buf[i]].advance(indices_buf[i])
+            for i in range(cands_size)
+        ]
+
+        banks = torch.tensor([state.bank for state in constraint_states], device=device)
+
+        # STEP 4: Sort
+        num_constraint_tokens = len(state.tokens)  # `state` is the last hypothesis visited by the STEP 2 loop
+
+        # Sort by keys (bank, score) (i.e., sort banks together, and scores
+        # within banks). AFAIK pytorch doesn't support either stable sort or
+        # multi-key sorting, so we have to hack this.
+        MAX_SCORE = -100
+        sort_key = (num_constraint_tokens - banks) * MAX_SCORE + scores_buf
+        sort_values, sort_indices = sort_key.sort(dim=0, descending=True)
+        scores_buf = scores_buf[sort_indices]
+        indices_buf = indices_buf[sort_indices]
+        beams_buf = beams_buf[sort_indices]
+        banks = banks[sort_indices]
+
+        # Sort the constraints to follow suit
+        constraint_states = [constraint_states[i] for i in sort_indices]
+
+        # STEP 5: Remove duplicates. The topk calls (overall and
+        # per-row) plus the per-row generation of constraints will
+        # produce duplicates. Here we remove them.
+
+        def roll(t):
+            """Rolls a 1d tensor right by 1 (the last element wraps to the front).
+
+            [0, 1, 2, 3, 4] becomes [4, 0, 1, 2, 3]
+            """
+            return torch.cat((t[-1].unsqueeze(0), t[0:-1]), dim=0)
+
+        # We map candidates (beam, token_id) to a single dimension.
+        # This is then shifted by 1. We can then easily identify
+        # duplicates and create a mask that identifies unique
+        # extensions.
+        uniques_mask = beams_buf * (self.vocab_size + 1) + indices_buf
+        uniques_mask = roll(uniques_mask) != uniques_mask
+
+        # Use the mask to pare down the data structures
+        scores_buf = torch.masked_select(scores_buf, uniques_mask)
+        indices_buf = torch.masked_select(indices_buf, uniques_mask)
+        beams_buf = torch.masked_select(beams_buf, uniques_mask)
+        banks = torch.masked_select(banks, uniques_mask)
+        i = 1
+        for mask in uniques_mask[1:]:
+            if not mask:
+                constraint_states.pop(i)
+            i += mask
+
+        # STEP 6: Assign IDs round-robin across banks, sort, and
+        # truncate. Now that the candidates are sorted by (bank,
+        # score) and uniqed, we dynamically allocate the {beam_size}
+        # beam by striping across the candidates. These stripes will
+        # be used as sort keys to do round-robin selection. This is
+        # accomplished in a single pass with offsets. Sorting by
+        # highest-banks (furthest-along hypotheses) first ensures
+        # progress through the constraints.
+        #
+        # e.g., BANKS: 3 3 3 2 2 2 2 1 1 1 0 0
+        # OLD STRIPES: 0 1 2 0 1 2 3 0 1 2 0 1
+        # NEW STRIPES: 0 1+4 2+8 0+1 1+5 2+9 3+11 0+2 1+6 2+10 0+3 1+7
+        #            = 0 5 10 1 6 11 13 2 7 12 3 8
+        #
+        # Sorting by this then gives the following banks:
+        #
+        #             3 2 1 0 3 2 1 0 3 2 1 2
+        #
+        # We'll take the top {beam_size} of these.
+        stripe_offsets = [offset * (len(banks) + 1) for offset in range(len(banks) + 1)]
+        stripes = torch.zeros_like(banks)
+        cur_bank_count = -1
+        cur_bank = banks[0]
+        for i, bank in enumerate(banks):
+            if bank != cur_bank:
+                cur_bank_count = 0
+                cur_bank = bank
+            else:
+                cur_bank_count += 1
+            stripes[i] = num_constraint_tokens - bank + stripe_offsets[cur_bank_count]
+
+        # STEP 7: Sort by the stripes values
+        sort_values, sort_indices = stripes.sort(dim=0)
+        scores_buf = scores_buf[sort_indices]
+        indices_buf = indices_buf[sort_indices]
+        beams_buf = beams_buf[sort_indices]
+        constraint_states = [constraint_states[i] for i in sort_indices]
+
+        # STEP 8: Truncate the tensors to self.num_cands (the state list is returned untruncated)
+        scores_buf = scores_buf[: self.num_cands]
+        indices_buf = indices_buf[: self.num_cands]
+        beams_buf = beams_buf[: self.num_cands]
+
+        return scores_buf, indices_buf, beams_buf, constraint_states
+
+
+class LengthConstrainedBeamSearch(Search):
+    """Beam search with EOS masked to -inf below a * src_len + b (min) and set to 0 from the max on."""
+
+    def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b):
+        super().__init__(tgt_dict)
+        self.min_len_a, self.min_len_b = min_len_a, min_len_b
+        self.max_len_a, self.max_len_b = max_len_a, max_len_b
+        self.beam = BeamSearch(tgt_dict)
+        self.needs_src_lengths = True
+
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores,
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        too_short = step < self.min_len_a * self.src_lengths + self.min_len_b
+        too_long = step >= self.max_len_a * self.src_lengths + self.max_len_b
+        lprobs[too_short, :, self.eos] = -math.inf
+        lprobs[too_long, :, self.eos] = 0
+        return self.beam.step(step, lprobs, scores)
+
+
+class DiverseBeamSearch(Search):
+    """Diverse Beam Search.
+
+    See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
+    Models" for details.
+
+    We implement cumulative diversity penalty here as default, optionally provide Hamming diversity described
+    in the original paper, and a way to interpolate between the two through diversity_discount.
+
+    Take the example below for illustration of cumulative diversity implemented.
+    A) I like dogs.
+    B) I like ____.
+    C) There are ___.
+    And we are at step=2, trying to fill in the blank:
+
+    Hamming diversity:
+    Penalty for B from A is 1 for "dogs" and 0 for any other words like "cats".
+    Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats".
+
+    Cumulative diversity (default):
+    Penalty for B from A is 3 for "dogs" and 0 for any other words like "cats".
+    Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats".
+    B and C differ because B matches with A for "I" and "like" at respective steps incurring 2 cumulative penalty.
+
+    Using diversity_discount to interpolate between the two:
+    if diversity_discount = 0.5, then
+    Penalty for B from A is 1.75 (1 + 0.5 + 0.25) for "dogs" and 0 for any other words like "cats".
+    Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats".
+    "I" and "like" matched for B and A at step 0 and 1 respectively. Since "I" is two steps away and "like" is one step away, they are discounted by (0.5)^2 and 0.5 respectively.
+    When diversity_discount = 0, we recover Hamming diversity and when diversity_discount = 1, we recover cumulative diversity.
+
+    NB: During beam search for each diversity group, `candidate_multiple` is set to 1 rather than BeamSearch default(2).
+    This is to ensure we have final `beam_size` candidates so that no diversity groups would be dropped during final token selection in sequence generation.
+    For full backwards compatibility, use diversity_discount=0 and candidate_multiple=2.
+
+    """
+
+    def __init__(
+        self,
+        tgt_dict,
+        num_groups,
+        diversity_strength,
+        diversity_discount=1.0,
+        candidate_multiple=1,
+    ):
+        super().__init__(tgt_dict)
+        self.num_groups = num_groups
+        self.diversity_strength = -diversity_strength
+        self.beam = BeamSearch(tgt_dict)
+        self.diversity_discount = diversity_discount
+        self.candidate_multiple = candidate_multiple
+
+        # Float tensor to keep track of overlap between groups.
+        # Each token shared at the same step between two groups is counted as one.
+        # Then token counts are discounted by `diversity_discount` for every next timestep.
+        # Once initialized, dimension is batch_size * num_groups * num_groups.
+        self.group_overlap = torch.empty(0)
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores,
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        bsz, beam_size, vocab_size = lprobs.size()
+        if beam_size % self.num_groups != 0:
+            raise ValueError(
+                "DiverseBeamSearch requires --beam to be divisible by the number of groups"
+            )
+
+        # initialize diversity penalty
+        diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs)
+
+        scores_G, beams_G = [], []
+
+        # pre-allocating tensor for indices for all groups
+        indices_G_stacked = torch.empty(
+            bsz,
+            int(beam_size / self.num_groups) * self.candidate_multiple,
+            self.num_groups,
+            dtype=torch.long,
+            device=lprobs.device,
+        )
+
+        for g in range(self.num_groups):
+            lprobs_g = lprobs[:, g :: self.num_groups, :]
+            scores_g = scores[:, g :: self.num_groups, :] if step > 0 else None
+
+            diversity_buf.zero_()
+            # apply diversity penalty
+            if g > 0:
+                indices_ = indices_G_stacked[:, :, :g]
+                if step > 0:
+                    penalty_val = 1 + self.group_overlap[original_batch_idxs, g, :g]
+                    penalty_val = penalty_val.unsqueeze(1)
+                else:
+                    penalty_val = torch.ones(bsz, 1, 1)
+                diversity_buf.scatter_add_(
+                    1,
+                    indices_.reshape(bsz, -1),
+                    penalty_val.expand(indices_.size())
+                    .reshape(bsz, -1)
+                    .to(diversity_buf),
+                )
+
+                lprobs_g = torch.add(
+                    lprobs_g,
+                    other=diversity_buf.unsqueeze(1),
+                    alpha=self.diversity_strength,
+                )
+            else:
+                lprobs_g = lprobs_g.contiguous()
+
+            scores_buf, indices_buf, beams_buf = self.beam.step(
+                step, lprobs_g, scores_g, candidate_multiple=self.candidate_multiple
+            )
+            beams_buf.mul_(self.num_groups).add_(g)
+
+            scores_G.append(scores_buf.clone())
+            beams_G.append(beams_buf.clone())
+
+            indices_G_stacked[:, :, g] = indices_buf
+
+        # interleave results from different groups
+        scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1)
+        indices_buf = indices_G_stacked.view(bsz, -1)
+        beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1)
+        # find num of overlapped tokens for each group pair
+        # then discount it for the next timestep
+        overlap = self.diversity_discount * torch.sum(
+            indices_G_stacked.unsqueeze(2).eq(indices_G_stacked.unsqueeze(3)), dim=1
+        )
+        if step == 0:
+            self.group_overlap = overlap
+        else:
+            self.group_overlap[original_batch_idxs] = (
+                self.group_overlap[original_batch_idxs] * self.diversity_discount
+                + overlap
+            )
+
+        return scores_buf, indices_buf, beams_buf
+
+
+class Sampling(Search):
+    sampling_topk: int  # > 0 enables top-k sampling (used only when top-p is not enabled)
+    sampling_topp: float  # > 0 enables top-p (nucleus) sampling; checked before top-k in step()
+
+    def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0):
+        super().__init__(tgt_dict)
+        self.sampling_topk = sampling_topk
+        self.sampling_topp = sampling_topp
+
+    def _sample_topp(self, lprobs):
+        """Sample among the smallest set of elements whose cumulative probability mass exceeds p.
+
+        See `"The Curious Case of Neural Text Degeneration"
+        (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.
+
+        Args:
+            lprobs: (bsz x input_beam_size x vocab_size)
+                the model's log-probabilities over the vocabulary at the current step
+
+        Return: A tuple of (trimed_probs, truncated_indices) where:
+            trimed_probs: (bsz x input_beam_size x ?)
+                the model's probabilities over the elements selected to sample from. The
+                width of the third dimension is determined by top-P.
+            truncated_indices: (bsz x input_beam_size x ?)
+                the indices of the chosen elements.
+        """
+        probs = lprobs.exp_()  # in place: the tensor passed as lprobs now holds probabilities
+
+        # sort the last dimension (vocab dimension) in descending order
+        sorted_probs, sorted_indices = probs.sort(descending=True)
+
+        # compute a mask to indicate the words to be included in the top-P set.
+        cumsum_probs = sorted_probs.cumsum(dim=2)
+        mask = cumsum_probs.lt(self.sampling_topp)
+
+        # note that mask was computed by 'lt'. One more word needs to be included
+        # so that the cumulative probability mass can exceed p.
+        cumsum_mask = mask.cumsum(dim=2)
+        last_included = cumsum_mask[:, :, -1:]
+        last_included.clamp_(0, mask.size()[2] - 1)
+        mask = mask.scatter_(2, last_included, 1)
+
+        # truncate unnecessary dims.
+        max_dim = last_included.max()
+        truncated_mask = mask[:, :, : max_dim + 1]
+        truncated_probs = sorted_probs[:, :, : max_dim + 1]
+        truncated_indices = sorted_indices[:, :, : max_dim + 1]
+
+        # trim the words that are not in top-P by setting their probabilities
+        # to 0, so that they would not be sampled later.
+        trim_mask = ~truncated_mask
+        trimed_probs = truncated_probs.masked_fill_(trim_mask, 0)
+        return trimed_probs, truncated_indices
+
+    @torch.jit.export
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores,
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        bsz, beam_size, vocab_size = lprobs.size()
+
+        if step == 0:
+            # at the first step all hypotheses are equally likely, so use
+            # only the first beam
+            lprobs = lprobs[:, ::beam_size, :].contiguous()
+
+        if self.sampling_topp > 0:
+            # only sample from the smallest set of words whose cumulative probability mass exceeds p
+            probs, top_indices = self._sample_topp(lprobs)
+        elif self.sampling_topk > 0:
+            # only sample from top-k candidates
+            lprobs, top_indices = lprobs.topk(self.sampling_topk)
+            probs = lprobs.exp_()
+        else:
+            probs = lprobs.exp_()
+
+            # dummy data to be consistent with true branch for type check
+            top_indices = torch.empty(0).to(probs)
+        # sample
+        if step == 0:
+            indices_buf = torch.multinomial(
+                probs.view(bsz, -1),
+                beam_size,
+                replacement=True,
+            ).view(bsz, beam_size)
+        else:
+            indices_buf = torch.multinomial(
+                probs.view(bsz * beam_size, -1),
+                1,
+                replacement=True,
+            ).view(bsz, beam_size)
+
+        if step == 0:
+            # expand to beam size
+            probs = probs.expand(bsz, beam_size, -1)
+
+        # gather scores
+        scores_buf = torch.gather(probs, dim=2, index=indices_buf.unsqueeze(-1))
+        scores_buf = scores_buf.log_().view(bsz, -1)
+
+        # remap indices if using top-k or top-P sampling
+        if self.sampling_topk > 0 or self.sampling_topp > 0:
+            indices_buf = torch.gather(
+                top_indices.expand(bsz, beam_size, -1),
+                dim=2,
+                index=indices_buf.unsqueeze(-1),
+            ).squeeze(2)
+
+        if step == 0:
+            beams_buf = indices_buf.new_zeros(bsz, beam_size)
+        else:
+            beams_buf = torch.arange(0, beam_size).to(indices_buf).repeat(bsz, 1)
+            # make scores cumulative
+            scores_buf.add_(
+                torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf)
+            )
+
+        return scores_buf, indices_buf, beams_buf
+
+
+class DiverseSiblingsSearch(Search):
+    """
+    Beam search with diverse siblings.
+
+    See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details.
+    https://arxiv.org/abs/1611.08562
+
+    1/ Calculate hypotheses for each beam
+    2/ Intra-sibling ordering
+    3/ Rewrite scores
+    4/ Choose top K hypotheses
+
+    With diversity_rate == 0 this is equivalent to BeamSearch.
+    """
+
+    def __init__(self, tgt_dict, diversity_rate):
+        super().__init__(tgt_dict)
+        self.diversity_rate = diversity_rate
+        self.beam = BeamSearch(tgt_dict)
+
+    def step(
+        self,
+        step: int,
+        lprobs,
+        scores,
+        prev_output_tokens: Optional[Tensor] = None,
+        original_batch_idxs: Optional[Tensor] = None,
+    ):
+        bsz, beam_size, vocab_size = lprobs.size()
+        if step == 0:
+            # no siblings to diversify yet: defer to plain beam search
+            return self.beam.step(step, lprobs, scores)
+
+        k = min(
+            # Take the best 2 x beam_size predictions. We'll choose the first
+            # beam_size of these which don't predict eos to continue with.
+            beam_size * 2,
+            lprobs.view(bsz, -1).size(1) - 1,  # -1 so we never select pad
+        )
+        s_list: List[Tensor]
+        i_list: List[Tensor]
+        s_list = [torch.empty(0).to(lprobs) for i in range(beam_size)]
+        i_list = [torch.LongTensor().to(device=lprobs.device) for i in range(beam_size)]
+        sibling_score = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate
+
+        # make lprobs cumulative (in place) by adding each hypothesis' running score
+        lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))
+
+        # 1/ Calculate hypotheses for each beam
+        for i in range(beam_size):
+            torch.topk(lprobs[:, i, :].view(bsz, -1), k, out=(s_list[i], i_list[i]))
+            i_list[i].fmod_(vocab_size)
+
+            # 2/ Intra-sibling ordering by default from topk + 3/ Rewrite scores
+            s_list[i].sub_(sibling_score)
+
+        # 4/ Choose top K hypotheses
+        indices = torch.stack(i_list, dim=1).view(bsz, -1)
+        final_scores, final_indices = torch.topk(
+            torch.stack(s_list, dim=1).view(bsz, -1),
+            k,
+        )
+
+        # each beam contributed k candidates, so flat position // k is the source beam
+        final_beams = final_indices // k
+
+        # map flat candidate positions back to vocabulary ids (all rows at once)
+        final_indices = indices.gather(1, final_indices)
+
+        return final_scores, final_indices, final_beams
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
new file mode 100644
index 0000000000..78db504e6c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
@@ -0,0 +1,1020 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import sys
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from fairseq import search, utils
+from fairseq.data import data_utils
+from fairseq.models import FairseqIncrementalDecoder
+from fairseq.ngram_repeat_block import NGramRepeatBlock
+
+
+class SequenceGenerator(nn.Module):
+    def __init__(
+        self,
+        models,
+        tgt_dict,
+        beam_size=1,
+        max_len_a=0,
+        max_len_b=200,
+        max_len=0,
+        min_len=1,
+        normalize_scores=True,
+        len_penalty=1.0,
+        unk_penalty=0.0,
+        temperature=1.0,
+        match_source_len=False,
+        no_repeat_ngram_size=0,
+        search_strategy=None,
+        eos=None,
+        symbols_to_strip_from_output=None,
+        lm_model=None,
+        lm_weight=1.0,
+        tokens_to_suppress=(),
+    ):
+        """Generates translations of a given source sentence.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models,
+                currently support fairseq.models.TransformerModel for scripting
+            beam_size (int, optional): beam width (default: 1)
+            max_len_a/b (int, optional): generate sequences of maximum length
+                ax + b, where x is the source length
+            max_len (int, optional): the maximum length of the generated output
+                (not including end-of-sentence)
+            min_len (int, optional): the minimum length of the generated output
+                (not including end-of-sentence)
+            normalize_scores (bool, optional): normalize scores by the length
+                of the output (default: True)
+            len_penalty (float, optional): length penalty, where <1.0 favors
+                shorter, >1.0 favors longer sentences (default: 1.0)
+            unk_penalty (float, optional): unknown word penalty, where <0
+                produces more unks, >0 produces fewer (default: 0.0)
+            temperature (float, optional): temperature, where values
+                >1.0 produce more uniform samples and values <1.0 produce
+                sharper samples (default: 1.0)
+            match_source_len (bool, optional): outputs should match the source
+                length (default: False)
+        """
+        super().__init__()
+        if isinstance(models, EnsembleModel):
+            self.model = models
+        else:
+            self.model = EnsembleModel(models)
+        self.tgt_dict = tgt_dict
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos() if eos is None else eos
+        self.symbols_to_strip_from_output = (
+            symbols_to_strip_from_output.union({self.eos})
+            if symbols_to_strip_from_output is not None
+            else {self.eos}
+        )
+
+        self.token_indices_to_suppress: Optional[Tensor] = None
+        token_indices_to_suppress = []
+        for token_string in tokens_to_suppress:
+            token_index = tgt_dict.index(token_string)
+            assert token_index != self.unk
+            token_indices_to_suppress.append(token_index)
+        if len(token_indices_to_suppress) > 0:
+            self.token_indices_to_suppress = torch.tensor(
+                token_indices_to_suppress, dtype=torch.long
+            )
+
+        self.vocab_size = len(tgt_dict)
+        # cap the beam at the dictionary size - 1, since we never select pad;
+        # the decoder is told the capped value on the line after
+        self.beam_size = min(beam_size, self.vocab_size - 1)
+        self.model.set_decoder_beam_size(self.beam_size)
+        self.max_len_a = max_len_a
+        self.max_len_b = max_len_b
+        self.min_len = min_len
+        self.max_len = max_len or self.model.max_decoder_positions()  # max_len=0 falls back to the model limit
+
+        self.normalize_scores = normalize_scores
+        self.len_penalty = len_penalty
+        self.unk_penalty = unk_penalty
+        self.temperature = temperature
+        self.match_source_len = match_source_len
+
+        if no_repeat_ngram_size > 0:
+            self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
+        else:
+            self.repeat_ngram_blocker = None
+
+        assert temperature > 0, "--temperature must be greater than 0"
+
+        self.search = (
+            search.BeamSearch(tgt_dict) if search_strategy is None else search_strategy
+        )
+        # We only need to set src_lengths in LengthConstrainedBeamSearch.
+        # As a module attribute, setting it would break in multithread
+        # settings when the model is shared.
+        self.should_set_src_lengths = (
+            hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths
+        )
+
+        self.model.eval()
+
+        self.lm_model = lm_model
+        self.lm_weight = lm_weight
+        if self.lm_model is not None:
+            self.lm_model.eval()
+
+    def cuda(self):
+        self.model.cuda()  # moves only self.model; self.lm_model is not touched here
+        return self
+
+    @torch.no_grad()
+    def forward(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):
+        """Generate a batch of translations (delegates to self._generate, gradients disabled).
+
+        Args:
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): force decoder to begin
+                with these tokens
+            bos_token (int, optional): beginning of sentence token
+                (default: self.eos)
+        """
+        return self._generate(sample, prefix_tokens, bos_token=bos_token)
+
+    # TODO(myleott): unused, deprecate after pytorch-translate migration
+    def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
+        """Iterate over a batched dataset and yield (id, src, ref, hypos) per sentence.
+        Args:
+            cuda (bool, optional): use GPU for generation
+            timer (StopwatchMeter, optional): time generations
+        """
+        for sample in data_itr:
+            s = utils.move_to_cuda(sample) if cuda else sample
+            if "net_input" not in s:
+                continue
+            net_input = s["net_input"]
+            # model.forward normally channels prev_output_tokens into the decoder
+            # separately, but SequenceGenerator directly calls model.encoder
+            encoder_input = {
+                k: v for k, v in net_input.items() if k != "prev_output_tokens"
+            }
+            if timer is not None:
+                timer.start()
+            # was generate(encoder_input): missing `sample`. forward() is already no_grad; assumes _generate reads "net_input"
+            hypos = self.forward({"net_input": encoder_input})
+            if timer is not None:
+                timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
+            for i, sample_id in enumerate(s["id"].data):
+                # remove padding
+                src = utils.strip_pad(net_input["src_tokens"].data[i, :], self.pad)
+                ref = (
+                    utils.strip_pad(s["target"].data[i, :], self.pad)
+                    if s["target"] is not None
+                    else None
+                )
+                yield sample_id, src, ref, hypos[i]
+
+    @torch.no_grad()
+    def generate(
+        self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs
+    ) -> List[List[Dict[str, Tensor]]]:
+        """Generate translations; mirrors the API of other fairseq generators.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ignored; decoding
+                always uses self.model
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): tokens each hypothesis
+                is forced to begin with
+            constraints (torch.LongTensor, optional): constraints the decoder
+                is forced to include
+            bos_token (int, optional): beginning of sentence token (default: self.eos)
+        """
+        return self._generate(sample, **kwargs)
+
+    def _generate(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Tensor] = None,
+        constraints: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):
+        """Run the step-by-step search for one batch; returns, for each input
+        sentence, its finalized hypotheses sorted by descending score."""
+        incremental_states = torch.jit.annotate(
+            List[Dict[str, Dict[str, Optional[Tensor]]]],
+            [
+                torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
+                for i in range(self.model.models_size)
+            ],
+        )
+        net_input = sample["net_input"]
+
+        if "src_tokens" in net_input:
+            src_tokens = net_input["src_tokens"]
+            # source length = number of tokens that are neither EOS nor pad;
+            # if src_lengths exists in net_input (speech_to_text dataset case), then use it
+            if "src_lengths" in net_input:
+                src_lengths = net_input["src_lengths"]
+            else:
+                src_lengths = (
+                    (src_tokens.ne(self.eos) & src_tokens.ne(self.pad))
+                    .long()
+                    .sum(dim=1)
+                )
+        elif "source" in net_input:
+            src_tokens = net_input["source"]
+            src_lengths = (
+                net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
+                if net_input["padding_mask"] is not None
+                else torch.tensor(src_tokens.size(-1)).to(src_tokens)
+            )
+        elif "features" in net_input:
+            src_tokens = net_input["features"]
+            src_lengths = (
+                net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
+                if net_input["padding_mask"] is not None
+                else torch.tensor(src_tokens.size(-1)).to(src_tokens)
+            )
+        else:
+            raise Exception(
+                "expected src_tokens or source in net input. input keys: "
+                + str(net_input.keys())
+            )
+
+        # bsz: number of sentences in the batch (first dim of src_tokens)
+        # Note that src_tokens may have more than 2 dimensions (i.e. audio features)
+        bsz, src_len = src_tokens.size()[:2]
+        beam_size = self.beam_size
+
+        if constraints is not None and not self.search.supports_constraints:
+            raise NotImplementedError(
+                "Target-side constraints were provided, but search method doesn't support them"
+            )
+
+        # Initialize constraints, when active
+        self.search.init_constraints(constraints, beam_size)
+
+        max_len: int = -1
+        if self.match_source_len:
+            max_len = src_lengths.max().item()
+        else:
+            max_len = min(
+                int(self.max_len_a * src_len + self.max_len_b),
+                self.max_len - 1,
+            )
+        assert (
+            self.min_len <= max_len
+        ), "min_len cannot be larger than max_len, please adjust these!"
+        # compute the encoder output for each beam
+        with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"):
+            encoder_outs = self.model.forward_encoder(net_input)
+
+        # repeat each sentence index beam_size times: one encoder row per hypothesis
+        new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1)
+        new_order = new_order.to(src_tokens.device).long()
+        encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order)
+        # ensure encoder_outs is a List.
+        assert encoder_outs is not None
+
+        # initialize buffers
+        scores = (
+            torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float()
+        )  # +1 for eos; pad is never chosen for scoring
+        tokens = (
+            torch.zeros(bsz * beam_size, max_len + 2)
+            .to(src_tokens)
+            .long()
+            .fill_(self.pad)
+        )  # +2 for eos and pad
+        tokens[:, 0] = self.eos if bos_token is None else bos_token
+        attn: Optional[Tensor] = None
+
+        # A list that indicates candidates that should be ignored.
+        # For example, suppose we're sampling and have already finalized 2/5
+        # samples. Then cands_to_ignore would mark 2 positions as being ignored,
+        # so that we only finalize the remaining 3 samples.
+        cands_to_ignore = (
+            torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
+        )  # forward and backward-compatible False mask
+
+        # list of completed sentences
+        finalized = torch.jit.annotate(
+            List[List[Dict[str, Tensor]]],
+            [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
+        )  # contains lists of dictionaries of information about the hypothesis being finalized at each step
+
+        # a boolean array indicating if the sentence at the index is finished or not
+        finished = [False for i in range(bsz)]
+        num_remaining_sent = bsz  # number of sentences remaining
+
+        # number of candidate hypos per step
+        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS
+
+        # offset arrays for converting between different indexing schemes
+        bbsz_offsets = (
+            (torch.arange(0, bsz) * beam_size)
+            .unsqueeze(1)
+            .type_as(tokens)
+            .to(src_tokens.device)
+        )
+        cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device)
+
+        reorder_state: Optional[Tensor] = None
+        batch_idxs: Optional[Tensor] = None
+
+        original_batch_idxs: Optional[Tensor] = None
+        if "id" in sample and isinstance(sample["id"], Tensor):
+            original_batch_idxs = sample["id"]
+        else:
+            original_batch_idxs = torch.arange(0, bsz).type_as(tokens)
+
+        for step in range(max_len + 1):  # one extra step for EOS marker
+            # reorder decoder internal states based on the prev choice of beams
+            if reorder_state is not None:
+                if batch_idxs is not None:
+                    # update beam indices to take into account removed sentences
+                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
+                        batch_idxs
+                    )
+                    reorder_state.view(-1, beam_size).add_(
+                        corr.unsqueeze(-1) * beam_size
+                    )
+                    original_batch_idxs = original_batch_idxs[batch_idxs]
+                self.model.reorder_incremental_state(incremental_states, reorder_state)
+                encoder_outs = self.model.reorder_encoder_out(
+                    encoder_outs, reorder_state
+                )
+            with torch.autograd.profiler.record_function(
+                "EnsembleModel: forward_decoder"
+            ):
+                lprobs, avg_attn_scores = self.model.forward_decoder(
+                    tokens[:, : step + 1],
+                    encoder_outs,
+                    incremental_states,
+                    self.temperature,
+                )
+
+            if self.lm_model is not None:
+                lm_out = self.lm_model(tokens[:, : step + 1])
+                probs = self.lm_model.get_normalized_probs(
+                    lm_out, log_probs=True, sample=None
+                )
+                probs = probs[:, -1, :] * self.lm_weight
+                lprobs += probs
+
+            lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)  # NaN -> -inf
+
+            lprobs[:, self.pad] = -math.inf  # never select pad
+            lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty
+
+            # handle max length constraint
+            if step >= max_len:
+                lprobs[:, : self.eos] = -math.inf
+                lprobs[:, self.eos + 1 :] = -math.inf
+
+            # handle prefix tokens (possibly with different lengths)
+            if (
+                prefix_tokens is not None
+                and step < prefix_tokens.size(1)
+                and step < max_len
+            ):
+                lprobs, tokens, scores = self._prefix_tokens(
+                    step, lprobs, scores, tokens, prefix_tokens, beam_size
+                )
+            else:
+                if step < self.min_len:
+                    # minimum length constraint (does not apply if using prefix_tokens)
+                    lprobs[:, self.eos] = -math.inf
+
+                if self.token_indices_to_suppress is not None:
+                    lprobs[:, self.token_indices_to_suppress] = -math.inf
+
+            # Record attention scores, only support avg_attn_scores is a Tensor
+            if avg_attn_scores is not None:
+                if attn is None:
+                    attn = torch.empty(
+                        bsz * beam_size, avg_attn_scores.size(1), max_len + 2
+                    ).to(scores)
+                attn[:, :, step + 1].copy_(avg_attn_scores)
+
+            scores = scores.type_as(lprobs)
+            eos_bbsz_idx = torch.empty(0).to(
+                tokens
+            )  # indices of hypothesis ending with eos (finished sentences)
+            eos_scores = torch.empty(0).to(
+                scores
+            )  # scores of hypothesis ending with eos (finished sentences)
+
+            if self.should_set_src_lengths:
+                self.search.set_src_lengths(src_lengths)
+
+            if self.repeat_ngram_blocker is not None:
+                lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step)
+
+            # Shape: (batch, cand_size)
+            cand_scores, cand_indices, cand_beams = self.search.step(
+                step,
+                lprobs.view(bsz, -1, self.vocab_size),
+                scores.view(bsz, beam_size, -1)[:, :, :step],
+                tokens[:, : step + 1],
+                original_batch_idxs,
+            )
+
+            # cand_bbsz_idx contains beam indices for the top candidate
+            # hypotheses, with a range of values: [0, bsz*beam_size),
+            # and dimensions: [bsz, cand_size]
+            cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+
+            # finalize hypotheses that end in eos
+            # Shape of eos_mask: (batch size, cand_size), same as cand_indices
+            eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf)
+            eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask)
+
+            # only consider eos when it's among the top beam_size indices
+            # Now we know what beam item(s) to finish
+            # Shape: 1d tensor of indices into the bsz * beam_size rows
+            eos_bbsz_idx = torch.masked_select(
+                cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size]
+            )
+
+            finalized_sents: List[int] = []
+            if eos_bbsz_idx.numel() > 0:
+                eos_scores = torch.masked_select(
+                    cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size]
+                )
+
+                finalized_sents = self.finalize_hypos(
+                    step,
+                    eos_bbsz_idx,
+                    eos_scores,
+                    tokens,
+                    scores,
+                    finalized,
+                    finished,
+                    beam_size,
+                    attn,
+                    src_lengths,
+                    max_len,
+                )
+                num_remaining_sent -= len(finalized_sents)
+
+            assert num_remaining_sent >= 0
+            if num_remaining_sent == 0:
+                break
+            if self.search.stop_on_max_len and step >= max_len:
+                break
+            assert step < max_len, f"{step} < {max_len}"
+
+            # Remove finalized sentences (ones for which {beam_size}
+            # finished hypotheses have been generated) from the batch.
+            if len(finalized_sents) > 0:
+                new_bsz = bsz - len(finalized_sents)
+
+                # construct batch_idxs which holds indices of batches to keep for the next pass
+                batch_mask = torch.ones(
+                    bsz, dtype=torch.bool, device=cand_indices.device
+                )
+                batch_mask[finalized_sents] = False
+                # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it
+                batch_idxs = torch.arange(
+                    bsz, device=cand_indices.device
+                ).masked_select(batch_mask)
+
+                # Choose the subset of the hypothesized constraints that will continue
+                self.search.prune_sentences(batch_idxs)
+
+                eos_mask = eos_mask[batch_idxs]
+                cand_beams = cand_beams[batch_idxs]
+                bbsz_offsets.resize_(new_bsz, 1)
+                cand_bbsz_idx = cand_beams.add(bbsz_offsets)
+                cand_scores = cand_scores[batch_idxs]
+                cand_indices = cand_indices[batch_idxs]
+
+                if prefix_tokens is not None:
+                    prefix_tokens = prefix_tokens[batch_idxs]
+                src_lengths = src_lengths[batch_idxs]
+                cands_to_ignore = cands_to_ignore[batch_idxs]
+
+                scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
+                tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1)
+                if attn is not None:
+                    attn = attn.view(bsz, -1)[batch_idxs].view(
+                        new_bsz * beam_size, attn.size(1), -1
+                    )
+                bsz = new_bsz
+            else:
+                batch_idxs = None
+
+            # Set active_mask so that values >= cand_size indicate eos (or ignored)
+            # hypos and values < cand_size indicate candidate active hypos.
+            # After, the min values per row are the top candidate active hypos
+
+            # (cands_to_ignore | eos_mask), spelled via De Morgan since element-wise or is not supported in torchscript.
+
+            eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size]))
+            active_mask = torch.add(
+                eos_mask.type_as(cand_offsets) * cand_size,
+                cand_offsets[: eos_mask.size(1)],
+            )
+
+            # get the top beam_size active hypotheses, which are just
+            # the hypos with the smallest values in active_mask.
+            # {active_hypos} indicates which {beam_size} hypotheses
+            # from the list of {2 * beam_size} candidates were
+            # selected. Shapes: (batch size, beam size)
+            new_cands_to_ignore, active_hypos = torch.topk(
+                active_mask, k=beam_size, dim=1, largest=False
+            )
+
+            # update cands_to_ignore to ignore any finalized hypos.
+            cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size]
+            # Make sure there is at least one active item for each sentence in the batch.
+            assert (~cands_to_ignore).any(dim=1).all()
+
+            # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam
+            # can be selected more than once).
+            active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
+            active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)
+
+            active_bbsz_idx = active_bbsz_idx.view(-1)
+            active_scores = active_scores.view(-1)
+
+            # copy tokens and scores for active hypotheses
+
+            # Set the tokens for each beam (can select the same row more than once)
+            tokens[:, : step + 1] = torch.index_select(
+                tokens[:, : step + 1], dim=0, index=active_bbsz_idx
+            )
+            # Select the next token for each of them
+            tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather(
+                cand_indices, dim=1, index=active_hypos
+            )
+            if step > 0:
+                scores[:, :step] = torch.index_select(
+                    scores[:, :step], dim=0, index=active_bbsz_idx
+                )
+            scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather(
+                cand_scores, dim=1, index=active_hypos
+            )
+
+            # Update constraints based on which candidates were selected for the next beam
+            self.search.update_constraints(active_hypos)
+
+            # copy attention for active hypotheses
+            if attn is not None:
+                attn[:, :, : step + 2] = torch.index_select(
+                    attn[:, :, : step + 2], dim=0, index=active_bbsz_idx
+                )
+
+            # reorder incremental state in decoder
+            reorder_state = active_bbsz_idx
+
+        # sort by score descending
+        for sent in range(len(finalized)):
+            scores = torch.tensor(
+                [float(elem["score"].item()) for elem in finalized[sent]]
+            )
+            _, sorted_scores_indices = torch.sort(scores, descending=True)
+            finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices]
+            finalized[sent] = torch.jit.annotate(
+                List[Dict[str, Tensor]], finalized[sent]
+            )
+        return finalized
+
+    def _prefix_tokens(
+        self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int
+    ):
+        """Force every beam to emit its prefix token at `step`; pad in the prefix leaves a row free."""
+        prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1)
+        prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1))
+        prefix_mask = prefix_toks.ne(self.pad)  # rows that actually have a prefix token here
+        lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs)  # forbid every token ...
+        lprobs[prefix_mask] = lprobs[prefix_mask].scatter(
+            -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask]
+        )  # ... then restore the log-prob of the prefix token itself
+        # if prefix includes eos, then we should make sure tokens and
+        # scores are the same across all beams
+        eos_mask = prefix_toks.eq(self.eos)
+        if eos_mask.any():
+            # validate that the first beam matches the prefix
+            first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[
+                :, 0, 1 : step + 1
+            ]
+            eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0]
+            target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step]
+            assert (first_beam == target_prefix).all()
+
+            # copy tokens, scores and lprobs from the first beam to all beams
+            tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size)
+            scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size)
+            lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size)
+        return lprobs, tokens, scores
+
+    def replicate_first_beam(self, tensor, mask, beam_size: int):
+        beams = tensor.view(-1, beam_size, tensor.size(-1))  # group rows per sentence
+        beams[mask] = beams[mask][:, :1, :]  # beam 0 overwrites all beams where mask is set
+        return beams.view(-1, beams.size(-1))
+
+    def finalize_hypos(
+        self,
+        step: int,
+        bbsz_idx,
+        eos_scores,
+        tokens,
+        scores,
+        finalized: List[List[Dict[str, Tensor]]],
+        finished: List[bool],
+        beam_size: int,
+        attn: Optional[Tensor],
+        src_lengths,
+        max_len: int,
+    ):
+        """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly.
+        A sentence is finalized when {beam_size} finished items have been collected for it.
+
+        Returns the indices (within the current, possibly reduced batch) of the sentences
+        that became finished at this step. These will be removed from the batch and not processed further.
+        Args:
+            bbsz_idx (Tensor): rows of `tokens`/`scores` (batch * beam indexing) that just emitted EOS
+        """
+        assert bbsz_idx.numel() == eos_scores.numel()
+
+        # clone relevant token and attention tensors.
+        # tokens is (batch * beam, max_len). So the index_select
+        # gets the newly EOS rows, then selects cols 1..{step + 2}
+        tokens_clone = tokens.index_select(0, bbsz_idx)[
+            :, 1 : step + 2
+        ]  # skip the first index, which is EOS
+
+        tokens_clone[:, step] = self.eos
+        attn_clone = (
+            attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2]
+            if attn is not None
+            else None
+        )
+
+        # compute scores per token position
+        pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1]
+        pos_scores[:, step] = eos_scores
+        # convert from cumulative to per-position scores
+        pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1]
+
+        # normalize sentence-level scores
+        if self.normalize_scores:
+            eos_scores /= (step + 1) ** self.len_penalty
+
+        # cum_unfin records which sentences in the batch are finished.
+        # It helps match indexing between (a) the original sentences
+        # in the batch and (b) the current, possibly-reduced set of
+        # sentences.
+        cum_unfin: List[int] = []
+        prev = 0
+        for f in finished:
+            if f:
+                prev += 1
+            else:
+                cum_unfin.append(prev)
+        cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx)
+
+        unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode="trunc")
+        sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx)
+
+        # Create a set of "{sent}{unfin_idx}", where
+        # "unfin_idx" is the index in the current (possibly reduced)
+        # list of sentences, and "sent" is the index in the original,
+        # unreduced batch.
+        # Both indices are packed into one integer (sent shifted left by 32 bits)
+        # so that torch.unique can de-duplicate the pairs in one call.
+        seen = (sent << 32) + unfin_idx
+        unique_seen: List[int] = torch.unique(seen).tolist()
+
+        if self.match_source_len:
+            condition = step > torch.index_select(src_lengths, 0, unfin_idx)
+            eos_scores = torch.where(condition, torch.tensor(-math.inf), eos_scores)
+        sent_list: List[int] = sent.tolist()
+        for i in range(bbsz_idx.size()[0]):
+            # An input sentence (among those in a batch) is finished when
+            # beam_size hypotheses have been collected for it
+            if len(finalized[sent_list[i]]) < beam_size:
+                if attn_clone is not None:
+                    # attention for this hypothesis, already sliced to positions 1..step+1
+                    hypo_attn = attn_clone[i]
+                else:
+                    hypo_attn = torch.empty(0)
+
+                finalized[sent_list[i]].append(
+                    {
+                        "tokens": tokens_clone[i],
+                        "score": eos_scores[i],
+                        "attention": hypo_attn,  # src_len x tgt_len
+                        "alignment": torch.empty(0),
+                        "positional_scores": pos_scores[i],
+                    }
+                )
+
+        newly_finished: List[int] = []
+        for unique_s in unique_seen:
+            # check termination conditions for this sentence
+            unique_sent: int = unique_s >> 32
+            unique_unfin_idx: int = unique_s - (unique_sent << 32)
+
+            if not finished[unique_sent] and self.is_finished(
+                step, unique_unfin_idx, max_len, len(finalized[unique_sent]), beam_size
+            ):
+                finished[unique_sent] = True
+                newly_finished.append(unique_unfin_idx)
+
+        return newly_finished
+
+    def is_finished(
+        self,
+        step: int,
+        unfin_idx: int,
+        max_len: int,
+        finalized_sent_len: int,
+        beam_size: int,
+    ):
+        """
+        Tell whether decoding of one sentence is complete.
+
+        That is the case once ``beam_size`` hypotheses have been finalized
+        for it, or once ``step`` has reached ``max_len``. ``unfin_idx`` is
+        accepted for interface compatibility but not consulted.
+        """
+        assert finalized_sent_len <= beam_size
+        return finalized_sent_len == beam_size or step == max_len
+
+
+class EnsembleModel(nn.Module):
+    """Wraps an ensemble of models and averages their decoder outputs for generation."""
+
+    def __init__(self, models):
+        super().__init__()
+        self.models_size = len(models)
+        # method '__len__' is not supported in ModuleList for torch script
+        self.single_model = models[0]
+        self.models = nn.ModuleList(models)
+
+        self.has_incremental: bool = False  # stays False unless the check below passes
+        if all(
+            hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
+            for m in models
+        ):
+            self.has_incremental = True
+
+    def forward(self):
+        pass  # intentionally empty; the generator calls forward_encoder / forward_decoder
+
+    def has_encoder(self):
+        return hasattr(self.single_model, "encoder")  # only the first model is inspected
+
+    def has_incremental_states(self):
+        return self.has_incremental
+
+    def max_decoder_positions(self):
+        return min(
+            [
+                m.max_decoder_positions()
+                for m in self.models
+                if hasattr(m, "max_decoder_positions")
+            ]
+            + [sys.maxsize]  # keeps min() defined when no model reports a limit
+        )
+
+    def set_decoder_beam_size(self, beam_size):
+        """Set beam size for efficient beamable enc-dec attention."""
+        if beam_size > 1:
+            for model in self.models:
+                if hasattr(model, "set_beam_size"):
+                    model.set_beam_size(beam_size)
+
+    @torch.jit.export
+    def forward_encoder(self, net_input: Dict[str, Tensor]):
+        if not self.has_encoder():
+            return None  # no encoder: there is nothing to run
+        return [model.encoder.forward_torchscript(net_input) for model in self.models]
+
+    @torch.jit.export
+    def forward_decoder(
+        self,
+        tokens,
+        encoder_outs: List[Dict[str, List[Tensor]]],
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        temperature: float = 1.0,
+    ):
+        log_probs = []
+        avg_attn: Optional[Tensor] = None
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None
+        for i, model in enumerate(self.models):
+            if self.has_encoder():
+                encoder_out = encoder_outs[i]
+            # decode each model
+            if self.has_incremental_states():
+                decoder_out = model.decoder.forward(
+                    tokens,
+                    encoder_out=encoder_out,
+                    incremental_state=incremental_states[i],
+                )
+            else:
+                if hasattr(model, "decoder"):
+                    decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out)
+                else:
+                    decoder_out = model.forward(tokens)
+
+            attn: Optional[Tensor] = None
+            decoder_len = len(decoder_out)
+            if decoder_len > 1 and decoder_out[1] is not None:
+                if isinstance(decoder_out[1], Tensor):
+                    attn = decoder_out[1]
+                else:
+                    attn_holder = decoder_out[1]["attn"]
+                    if isinstance(attn_holder, Tensor):
+                        attn = attn_holder
+                    elif attn_holder is not None:
+                        attn = attn_holder[0]
+                if attn is not None:
+                    attn = attn[:, -1, :]  # keep the last index of dim 1 only
+
+            decoder_out_tuple = (
+                decoder_out[0][:, -1:, :].div_(temperature),  # in-place temperature scaling
+                None if decoder_len <= 1 else decoder_out[1],
+            )
+            probs = model.get_normalized_probs(
+                decoder_out_tuple, log_probs=True, sample=None
+            )
+            probs = probs[:, -1, :]
+            if self.models_size == 1:  # single model: nothing to average
+                return probs, attn
+
+            log_probs.append(probs)
+            if attn is not None:
+                if avg_attn is None:
+                    avg_attn = attn
+                else:
+                    avg_attn.add_(attn)
+
+        avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(
+            self.models_size
+        )  # log of the mean probability over the models
+
+        if avg_attn is not None:
+            avg_attn.div_(self.models_size)
+        return avg_probs, avg_attn
+
+    @torch.jit.export
+    def reorder_encoder_out(
+        self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order
+    ):
+        """
+        Reorder encoder output according to *new_order*.
+
+        Args:
+            encoder_outs: per-model encoder outputs, as returned by ``forward_encoder()``
+            new_order (LongTensor): desired order
+
+        Returns:
+            a list with each model's encoder output rearranged according to *new_order*
+        """
+        new_outs: List[Dict[str, List[Tensor]]] = []
+        if not self.has_encoder():
+            return new_outs
+        for i, model in enumerate(self.models):
+            assert encoder_outs is not None
+            new_outs.append(
+                model.encoder.reorder_encoder_out(encoder_outs[i], new_order)
+            )
+        return new_outs
+
+    @torch.jit.export
+    def reorder_incremental_state(
+        self,
+        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
+        new_order,
+    ):
+        if not self.has_incremental_states():
+            return  # nothing to reorder without incremental decoding
+        for i, model in enumerate(self.models):
+            model.decoder.reorder_incremental_state_scripting(
+                incremental_states[i], new_order
+            )
+
+
+class SequenceGeneratorWithAlignment(SequenceGenerator):
+    def __init__(
+        self, models, tgt_dict, left_pad_target=False, print_alignment="hard", **kwargs
+    ):
+        """Sequence generator that also attaches alignments to each hypothesis.
+
+        Alignments follow "Jointly Learning to Align and Translate with
+        Transformer Models" (Garg et al., EMNLP 2019).
+
+        Args:
+            left_pad_target (bool, optional): whether hypotheses are left
+                padded when teacher forced to obtain alignments.
+            print_alignment (str, optional): "hard" or "soft"; selects the
+                utils.extract_*_alignment function used by generate().
+        """
+        super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs)
+        self.left_pad_target = left_pad_target
+        if print_alignment == "soft":
+            self.extract_alignment = utils.extract_soft_alignment
+        elif print_alignment == "hard":
+            self.extract_alignment = utils.extract_hard_alignment
+
+    @torch.no_grad()
+    def generate(self, models, sample, **kwargs):
+        finalized = super()._generate(sample, **kwargs)
+
+        src_tokens = sample["net_input"]["src_tokens"]
+        bsz = src_tokens.shape[0]
+        beam_size = self.beam_size
+        (
+            src_tokens,
+            src_lengths,
+            prev_output_tokens,
+            tgt_tokens,
+        ) = self._prepare_batch_for_alignment(sample, finalized)
+        if any(getattr(m, "full_context_alignment", False) for m in self.model.models):
+            attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens)
+        else:
+            attn = [
+                finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0)
+                for i in range(bsz * beam_size)
+            ]
+
+        if src_tokens.device != "cpu":
+            src_tokens = src_tokens.to("cpu")
+            tgt_tokens = tgt_tokens.to("cpu")
+            attn = [i.to("cpu") for i in attn]
+
+        # Process the attn matrix to extract hard alignments.
+        for i in range(bsz * beam_size):
+            alignment = self.extract_alignment(
+                attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos
+            )
+            finalized[i // beam_size][i % beam_size]["alignment"] = alignment
+        return finalized
+
+    def _prepare_batch_for_alignment(self, sample, hypothesis):
+        src_tokens = sample["net_input"]["src_tokens"]
+        bsz = src_tokens.shape[0]
+        src_tokens = (
+            src_tokens[:, None, :]
+            .expand(-1, self.beam_size, -1)
+            .contiguous()
+            .view(bsz * self.beam_size, -1)
+        )
+        src_lengths = sample["net_input"]["src_lengths"]
+        src_lengths = (
+            src_lengths[:, None]
+            .expand(-1, self.beam_size)
+            .contiguous()
+            .view(bsz * self.beam_size)
+        )
+        prev_output_tokens = data_utils.collate_tokens(
+            [beam["tokens"] for example in hypothesis for beam in example],
+            self.pad,
+            self.eos,
+            self.left_pad_target,
+            move_eos_to_beginning=True,
+        )
+        tgt_tokens = data_utils.collate_tokens(
+            [beam["tokens"] for example in hypothesis for beam in example],
+            self.pad,
+            self.eos,
+            self.left_pad_target,
+            move_eos_to_beginning=False,
+        )
+        return src_tokens, src_lengths, prev_output_tokens, tgt_tokens
+
+
+class EnsembleModelWithAlignment(EnsembleModel):
+    """Ensemble wrapper that can also average the models' attention."""
+
+    def __init__(self, models):
+        super().__init__(models)
+
+    def forward_align(self, src_tokens, src_lengths, prev_output_tokens):
+        """Average the ``attn`` entry of every model's output (in place)."""
+        n_models, summed = len(self.models), None
+        for member in self.models:
+            current = member(src_tokens, src_lengths, prev_output_tokens)[1]["attn"][0]
+            if summed is not None:
+                summed.add_(current)
+            else:
+                summed = current
+        if n_models > 1:
+            summed.div_(n_models)
+        return summed
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
new file mode 100644
index 0000000000..411d4df444
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
@@ -0,0 +1,153 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+
+import torch
+from fairseq import utils
+
+
+class SequenceScorer(object):
+    """Scores the target for a given source sentence."""
+
+    def __init__(
+        self,
+        tgt_dict,
+        softmax_batch=None,
+        compute_alignment=False,
+        eos=None,
+        symbols_to_strip_from_output=None,
+    ):
+        self.pad = tgt_dict.pad()
+        self.eos = tgt_dict.eos() if eos is None else eos
+        self.softmax_batch = softmax_batch or sys.maxsize  # None/0 -> sys.maxsize
+        assert self.softmax_batch > 0
+        self.compute_alignment = compute_alignment
+        self.symbols_to_strip_from_output = (
+            symbols_to_strip_from_output.union({self.eos})
+            if symbols_to_strip_from_output is not None
+            else {self.eos}
+        )
+
+    @torch.no_grad()
+    def generate(self, models, sample, **kwargs):
+        """Score a batch of translations.
+
+        Returns one single-element list of hypothesis dicts per batch entry.
+        """
+        net_input = sample["net_input"]
+
+        def batch_for_softmax(dec_out, target):
+            # assumes decoder_out[0] is the only thing needed (may not be correct for future models!)
+            first, rest = dec_out[0], dec_out[1:]
+            bsz, tsz, dim = first.shape
+            if bsz * tsz < self.softmax_batch:
+                yield dec_out, target, True
+            else:
+                flat = first.contiguous().view(1, -1, dim)
+                flat_tgt = target.contiguous().view(flat.shape[:-1])
+                s = 0
+                while s < flat.size(1):
+                    e = s + self.softmax_batch
+                    yield (flat[:, s:e],) + rest, flat_tgt[:, s:e], False
+                    s = e
+
+        def gather_target_probs(probs, target):
+            return probs.gather(dim=2, index=target.unsqueeze(-1))
+
+        orig_target = sample["target"]
+
+        # compute scores for each model in the ensemble
+        avg_probs = None
+        avg_attn = None
+        for model in models:
+            model.eval()
+            decoder_out = model(**net_input)
+            attn = decoder_out[1] if len(decoder_out) > 1 else None
+            if isinstance(attn, dict):
+                attn = attn.get("attn", None)
+
+            batched = batch_for_softmax(decoder_out, orig_target)
+            probs, idx = None, 0
+            for bd, tgt, is_single in batched:
+                sample["target"] = tgt  # chunk target, restored at the end of the loop body
+                curr_prob = model.get_normalized_probs(
+                    bd, log_probs=len(models) == 1, sample=sample
+                ).data
+                if is_single:
+                    probs = gather_target_probs(curr_prob, orig_target)
+                else:
+                    if probs is None:
+                        probs = curr_prob.new(orig_target.numel())
+                    step = curr_prob.size(0) * curr_prob.size(1)
+                    end = step + idx
+                    tgt_probs = gather_target_probs(
+                        curr_prob.view(tgt.shape + (curr_prob.size(-1),)), tgt
+                    )
+                    probs[idx:end] = tgt_probs.view(-1)
+                    idx = end
+                sample["target"] = orig_target
+
+            probs = probs.view(sample["target"].shape)
+
+            if avg_probs is None:
+                avg_probs = probs  # later models are added into this tensor in place
+            else:
+                avg_probs.add_(probs)
+            if attn is not None:
+                if torch.is_tensor(attn):
+                    attn = attn.data
+                else:
+                    attn = attn[0]
+                if avg_attn is None:
+                    avg_attn = attn
+                else:
+                    avg_attn.add_(attn)
+        # ensembles summed plain probabilities above: average them, then take the log
+        if len(models) > 1:
+            avg_probs.div_(len(models))
+            avg_probs.log_()
+            if avg_attn is not None:
+                avg_attn.div_(len(models))
+
+        bsz = avg_probs.size(0)
+        hypos = []
+        start_idxs = sample["start_indices"] if "start_indices" in sample else [0] * bsz
+        for i in range(bsz):
+            # remove padding from ref
+            ref = (
+                utils.strip_pad(sample["target"][i, start_idxs[i] :], self.pad)
+                if sample["target"] is not None
+                else None
+            )
+            tgt_len = ref.numel()
+            avg_probs_i = avg_probs[i][start_idxs[i] : start_idxs[i] + tgt_len]
+            score_i = avg_probs_i.sum() / tgt_len
+            if avg_attn is not None:
+                avg_attn_i = avg_attn[i]
+                if self.compute_alignment:
+                    alignment = utils.extract_hard_alignment(
+                        avg_attn_i,
+                        sample["net_input"]["src_tokens"][i],
+                        sample["target"][i],
+                        self.pad,
+                        self.eos,
+                    )
+                else:
+                    alignment = None
+            else:
+                avg_attn_i = alignment = None
+            hypos.append(
+                [
+                    {
+                        "tokens": ref,
+                        "score": score_i,
+                        "attention": avg_attn_i,
+                        "alignment": alignment,
+                        "positional_scores": avg_probs_i,
+                    }
+                ]
+            )
+        return hypos
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
new file mode 100644
index 0000000000..f2cc8b5e86
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
@@ -0,0 +1,427 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig
+
+
+class SpeechGenerator(object):
+    def __init__(self, model, vocoder, data_cfg: S2TDataConfig):
+        self.model = model
+        self.vocoder = vocoder
+        npz_path = data_cfg.global_cmvn_stats_npz  # may be None: no stats file
+        self.gcmvn_stats = np.load(npz_path) if npz_path is not None else None
+
+    def gcmvn_denormalize(self, x):
+        """Scale ``x`` (B x T x C) by the stored std and add the stored mean."""
+        stats = self.gcmvn_stats
+        if stats is None:
+            return x
+        mean, std = (torch.from_numpy(stats[k]).to(x) for k in ("mean", "std"))
+        assert len(x.shape) == 3 and mean.shape[0] == std.shape[0] == x.shape[2]
+        scaled = x * std.view(1, 1, -1).expand_as(x)
+        return scaled + mean.view(1, 1, -1).expand_as(scaled)
+
+    def get_waveform(self, feat):
+        # T x C -> T
+        if self.vocoder is None:
+            return None
+        return self.vocoder(feat).squeeze(0)
+
+
+class AutoRegressiveSpeechGenerator(SpeechGenerator):
+    def __init__(
+        self,
+        model,
+        vocoder,
+        data_cfg,
+        max_iter: int = 6000,
+        eos_prob_threshold: float = 0.5,
+    ):
+        super().__init__(model, vocoder, data_cfg)
+        self.max_iter = max_iter  # upper bound on decoder steps in generate()
+        self.eos_prob_threshold = eos_prob_threshold  # EOS prob above this finishes an item
+
+    @torch.no_grad()
+    def generate(self, model, sample, has_targ=False, **kwargs):
+        model.eval()  # note: the *model* argument is used here, not self.model
+
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lengths = sample["net_input"]["src_lengths"]
+        bsz, src_len = src_tokens.size()[:2]  # src_len is not used below
+        n_frames_per_step = model.decoder.n_frames_per_step
+        out_dim = model.decoder.out_dim
+        raw_dim = out_dim // n_frames_per_step  # last dim after the reshape below
+
+        # encode once; the decoder is then called once per step in the loop
+        encoder_out = model.forward_encoder(
+            src_tokens, src_lengths, speaker=sample["speaker"]
+        )
+        incremental_state = {}
+        feat, attn, eos_prob = [], [], []  # per-step outputs, concatenated after the loop
+        finished = src_tokens.new_zeros((bsz,)).bool()
+        out_lens = src_lengths.new_zeros((bsz,)).long().fill_(self.max_iter)  # max_iter == not finished yet
+
+        prev_feat_out = encoder_out["encoder_out"][0].new_zeros(bsz, 1, out_dim)  # all-zero first input
+        for step in range(self.max_iter):
+            cur_out_lens = out_lens.clone()
+            cur_out_lens.masked_fill_(cur_out_lens.eq(self.max_iter), step + 1)  # unfinished: running length
+            _, cur_eos_out, cur_extra = model.forward_decoder(
+                prev_feat_out,
+                encoder_out=encoder_out,
+                incremental_state=incremental_state,
+                target_lengths=cur_out_lens,
+                speaker=sample["speaker"],
+                **kwargs,
+            )
+            cur_eos_prob = torch.sigmoid(cur_eos_out).squeeze(2)
+            feat.append(cur_extra["feature_out"])
+            attn.append(cur_extra["attn"])
+            eos_prob.append(cur_eos_prob)
+
+            cur_finished = cur_eos_prob.squeeze(1) > self.eos_prob_threshold
+            out_lens.masked_fill_((~finished) & cur_finished, step + 1)  # only the first EOS step is recorded
+            finished = finished | cur_finished
+            if finished.sum().item() == bsz:  # every item has finished
+                break
+            prev_feat_out = cur_extra["feature_out"]  # this step's output feeds the next step
+
+        feat = torch.cat(feat, dim=1)
+        feat = model.decoder.postnet(feat) + feat  # postnet output is added to its input
+        eos_prob = torch.cat(eos_prob, dim=1)
+        attn = torch.cat(attn, dim=2)
+        alignment = attn.max(dim=1)[1]  # index of the maximum along dim 1
+
+        feat = feat.reshape(bsz, -1, raw_dim)
+        feat = self.gcmvn_denormalize(feat)
+
+        # repeat per-step values n_frames_per_step times to match the reshaped feat
+        eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1)
+        attn = attn.repeat_interleave(n_frames_per_step, dim=2)
+        alignment = alignment.repeat_interleave(n_frames_per_step, dim=1)
+        out_lens = out_lens * n_frames_per_step
+
+        finalized = [
+            {
+                "feature": feat[b, :out_len],
+                "eos_prob": eos_prob[b, :out_len],
+                "attn": attn[b, :, :out_len],
+                "alignment": alignment[b, :out_len],
+                "waveform": self.get_waveform(feat[b, :out_len]),
+            }
+            for b, out_len in zip(range(bsz), out_lens)
+        ]
+
+        if has_targ:  # also attach the denormalized reference features and waveform
+            assert sample["target"].size(-1) == out_dim
+            tgt_feats = sample["target"].view(bsz, -1, raw_dim)
+            tgt_feats = self.gcmvn_denormalize(tgt_feats)
+            tgt_lens = sample["target_lengths"] * n_frames_per_step
+            for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)):
+                finalized[b]["targ_feature"] = f[:l]
+                finalized[b]["targ_waveform"] = self.get_waveform(f[:l])
+        return finalized
+
+
+class MultiDecoderSpeechGenerator(SpeechGenerator):
+    def __init__(
+        self,
+        models,
+        args,
+        vocoder,
+        data_cfg,
+        tgt_dict_mt,
+        max_iter: int = 6000,
+        eos_prob_threshold: float = 0.5,
+        eos_mt=None,
+        symbols_to_strip_from_output=None,
+    ):
+        super().__init__(models[0], vocoder, data_cfg)  # only models[0] becomes self.model
+        self.max_iter = max_iter
+        self.eos_prob_threshold = eos_prob_threshold
+
+        self.tgt_dict_mt = tgt_dict_mt
+        self.eos_mt = eos_mt
+
+        from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator
+        from fairseq import search
+
+        self.text_generator = SequenceGenerator(
+            models,
+            tgt_dict_mt,
+            beam_size=max(1, getattr(args, "beam", 5)),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            search_strategy=search.BeamSearch(tgt_dict_mt),
+            eos=eos_mt,
+            symbols_to_strip_from_output=symbols_to_strip_from_output,
+        )
+
+    @torch.no_grad()
+    def generate(self, model, sample, has_targ=False, **kwargs):
+        """Decode text with the MT decoder, then decode speech features from it."""
+        model.eval()
+
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lengths = sample["net_input"]["src_lengths"]
+        bsz, src_len = src_tokens.size()[:2]
+        n_frames_per_step = model.decoder.n_frames_per_step
+        out_dim = model.decoder.out_dim
+        raw_dim = out_dim // n_frames_per_step
+
+        # initialize
+        encoder_out = model.forward_encoder(
+            src_tokens, src_lengths, speaker=sample["speaker"]
+        )
+
+        prefix_tokens = None
+        constraints = None
+        bos_token = None
+
+        mt_decoder = getattr(model, f"{model.mt_task_name}_decoder")
+
+        # 1. MT decoder
+        finalized_mt = self.text_generator.generate_decoder(
+            [encoder_out],
+            src_tokens,
+            src_lengths,
+            sample,
+            prefix_tokens,
+            constraints,
+            bos_token,
+            aux_task_name=model.mt_task_name,
+        )
+
+        # extract decoder output corresponding to the best hypothesis
+        max_tgt_len = max(len(hypo[0]["tokens"]) for hypo in finalized_mt)
+        prev_output_tokens_mt = (
+            src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len)
+            .fill_(mt_decoder.padding_idx)
+            .int()
+        )  # B x T
+        sample_ids = sample["id"].tolist()  # converted once instead of once per item
+        for i, hypo in enumerate(finalized_mt):
+            tmp = hypo[0]["tokens"].int()  # hyp + eos (first hypothesis only)
+            prev_output_tokens_mt[i, 0] = self.text_generator.eos
+            if tmp[-1] == self.text_generator.eos:
+                tmp = tmp[:-1]
+            prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp
+
+            text = "".join([self.tgt_dict_mt[c] for c in tmp])
+            text = text.replace("_", " ")
+            text = text.replace("▁", " ")
+            text = text.replace("<unk>", " ")
+            text = text.replace("<s>", "")
+            text = text.replace("</s>", "")
+            if len(text) > 0 and text[0] == " ":
+                text = text[1:]
+            sample_id = sample_ids[i]
+            print("{} (None-{})".format(text, sample_id))
+
+        mt_decoder_out = mt_decoder(
+            prev_output_tokens_mt,
+            encoder_out=encoder_out,
+            features_only=True,
+        )
+        x = mt_decoder_out[0].transpose(0, 1)
+
+        pad_positions = prev_output_tokens_mt.eq(mt_decoder.padding_idx)
+        mt_decoder_padding_mask = pad_positions if pad_positions.any() else None
+
+        # 2. TTS encoder
+        if getattr(model, "synthesizer_encoder", None) is not None:
+            synthesizer_encoder_out = model.synthesizer_encoder(
+                x,
+                mt_decoder_padding_mask,
+            )
+        else:
+            synthesizer_encoder_out = {
+                "encoder_out": [x],  # T x B x C
+                "encoder_padding_mask": [mt_decoder_padding_mask]
+                if mt_decoder_padding_mask is not None
+                else [],  # B x T
+                "encoder_embedding": [],
+                "encoder_states": [],
+                "src_tokens": [],
+                "src_lengths": [],
+            }
+
+        # 3. TTS decoder
+        incremental_state = {}
+        feat, attn, eos_prob = [], [], []
+        finished = src_tokens.new_zeros((bsz,)).bool()
+        out_lens = src_lengths.new_zeros((bsz,)).long().fill_(self.max_iter)  # max_iter == not finished yet
+
+        prev_feat_out = encoder_out["encoder_out"][0].new_zeros(bsz, 1, out_dim)
+        for step in range(self.max_iter):
+            cur_out_lens = out_lens.clone()
+            cur_out_lens.masked_fill_(cur_out_lens.eq(self.max_iter), step + 1)
+            _, cur_eos_out, cur_extra = model.forward_decoder(
+                prev_feat_out,
+                encoder_out=synthesizer_encoder_out,
+                incremental_state=incremental_state,
+                target_lengths=cur_out_lens,
+                speaker=sample["speaker"],
+                **kwargs,
+            )
+            cur_eos_prob = torch.sigmoid(cur_eos_out).squeeze(2)
+            feat.append(cur_extra["feature_out"])
+            attn.append(cur_extra["attn"])
+            eos_prob.append(cur_eos_prob)
+
+            cur_finished = cur_eos_prob.squeeze(1) > self.eos_prob_threshold
+            out_lens.masked_fill_((~finished) & cur_finished, step + 1)  # only the first EOS step is recorded
+            finished = finished | cur_finished
+            if finished.sum().item() == bsz:
+                break
+            prev_feat_out = cur_extra["feature_out"]
+
+        feat = torch.cat(feat, dim=1)
+        feat = model.decoder.postnet(feat) + feat
+        eos_prob = torch.cat(eos_prob, dim=1)
+        attn = torch.cat(attn, dim=2)
+        alignment = attn.max(dim=1)[1]
+
+        feat = feat.reshape(bsz, -1, raw_dim)
+        feat = self.gcmvn_denormalize(feat)
+
+        eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1)
+        attn = attn.repeat_interleave(n_frames_per_step, dim=2)
+        alignment = alignment.repeat_interleave(n_frames_per_step, dim=1)
+        out_lens = out_lens * n_frames_per_step
+
+        finalized = [
+            {
+                "feature": feat[b, :out_len],
+                "eos_prob": eos_prob[b, :out_len],
+                "attn": attn[b, :, :out_len],
+                "alignment": alignment[b, :out_len],
+                "waveform": self.get_waveform(feat[b, :out_len]),
+            }
+            for b, out_len in zip(range(bsz), out_lens)
+        ]
+
+        if has_targ:
+            assert sample["target"].size(-1) == out_dim
+            tgt_feats = sample["target"].view(bsz, -1, raw_dim)
+            tgt_feats = self.gcmvn_denormalize(tgt_feats)
+            tgt_lens = sample["target_lengths"] * n_frames_per_step
+            for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)):
+                finalized[b]["targ_feature"] = f[:l]
+                finalized[b]["targ_waveform"] = self.get_waveform(f[:l])
+        return finalized
+
+
+class NonAutoregressiveSpeechGenerator(SpeechGenerator):
+    @torch.no_grad()
+    def generate(self, model, sample, has_targ=False, **kwargs):
+        """Run one forward pass of *model* and split the result per batch item."""
+        model.eval()
+
+        net_input = sample["net_input"]
+        bsz, _ = net_input["src_tokens"].size()
+        frames_per_step = model.encoder.n_frames_per_step
+        out_dim = model.encoder.out_dim
+        raw_dim = out_dim // frames_per_step
+
+        feat, feat_post, out_lens, log_dur_out, _, _ = model(
+            src_tokens=net_input["src_tokens"],
+            src_lengths=net_input["src_lengths"],
+            prev_output_tokens=net_input["prev_output_tokens"],
+            incremental_state=None,
+            target_lengths=sample["target_lengths"],
+            speaker=sample["speaker"],
+        )
+        # use the second output instead of the first whenever it is present
+        frames = feat if feat_post is None else feat_post
+        frames = self.gcmvn_denormalize(frames.view(bsz, -1, raw_dim))
+
+        durations = torch.round(torch.exp(log_dur_out) - 1).long().clamp(min=0)
+
+        def get_dur_plot_data(d):
+            # 1-based position i + 1, repeated d[i] times
+            return [i + 1 for i, dd in enumerate(d) for _ in range(dd.item())]
+
+        def item_frames(b, l):
+            # an item whose length is not positive gets a single all-zero frame
+            return frames[b, :l] if l > 0 else frames.new_zeros([1, raw_dim])
+
+        out_lens = out_lens * frames_per_step
+        finalized = []
+        for b, l in zip(range(bsz), out_lens):
+            finalized.append(
+                {
+                    "feature": item_frames(b, l),
+                    "waveform": self.get_waveform(item_frames(b, l)),
+                    "attn": frames.new_tensor(get_dur_plot_data(durations[b])),
+                }
+            )
+
+        if has_targ:
+            targets = self.gcmvn_denormalize(sample["target"].view(bsz, -1, raw_dim))
+            target_lens = sample["target_lengths"] * frames_per_step
+            for b, (f, l) in enumerate(zip(targets, target_lens)):
+                finalized[b]["targ_feature"] = f[:l]
+                finalized[b]["targ_waveform"] = self.get_waveform(f[:l])
+        return finalized
+
+
+class TeacherForcingAutoRegressiveSpeechGenerator(AutoRegressiveSpeechGenerator):
+    @torch.no_grad()
+    def generate(self, model, sample, has_targ=False, **kwargs):
+        """Decode with ``prev_output_tokens`` from *sample* in one model call."""
+        model.eval()
+
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lens = sample["net_input"]["src_lengths"]
+        prev_out_tokens = sample["net_input"]["prev_output_tokens"]
+        step_lens = sample["target_lengths"]
+        frames_per_step = model.decoder.n_frames_per_step
+        raw_dim = model.decoder.out_dim // frames_per_step
+        bsz = src_tokens.shape[0]
+
+        feat, eos_prob, extra = model(
+            src_tokens,
+            src_lens,
+            prev_out_tokens,
+            incremental_state=None,
+            target_lengths=step_lens,
+            speaker=sample["speaker"],
+        )
+
+        attn = extra["attn"]  # B x T_s x T_t
+        alignment = attn.max(dim=1).indices
+        feat = self.gcmvn_denormalize(feat.reshape(bsz, -1, raw_dim))
+        eos_prob = eos_prob.repeat_interleave(frames_per_step, dim=1)
+        attn = attn.repeat_interleave(frames_per_step, dim=2)
+        alignment = alignment.repeat_interleave(frames_per_step, dim=1)
+        tgt_lens = sample["target_lengths"] * frames_per_step
+
+        finalized = []
+        for b, tgt_len in zip(range(bsz), tgt_lens):
+            finalized.append(
+                {
+                    "feature": feat[b, :tgt_len],
+                    "eos_prob": eos_prob[b, :tgt_len],
+                    "attn": attn[b, :, :tgt_len],
+                    "alignment": alignment[b, :tgt_len],
+                    "waveform": self.get_waveform(feat[b, :tgt_len]),
+                }
+            )
+
+        if has_targ:
+            tgt_feats = self.gcmvn_denormalize(sample["target"].view(bsz, -1, raw_dim))
+            for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)):
+                finalized[b]["targ_feature"] = f[:l]
+                finalized[b]["targ_waveform"] = self.get_waveform(f[:l])
+        return finalized
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
new file mode 100644
index 0000000000..6da1f001f0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
@@ -0,0 +1,138 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import argparse
+import importlib
+import os
+
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import merge_with_parent
+from hydra.core.config_store import ConfigStore
+
+from .fairseq_task import FairseqTask, LegacyFairseqTask  # noqa
+
+
+# registries filled by register_task(): name -> dataclass, name -> task class, class names
+TASK_DATACLASS_REGISTRY = {}
+TASK_REGISTRY = {}
+TASK_CLASS_NAMES = set()
+
+
+def setup_task(cfg: FairseqDataclass, **kwargs):
+    """Resolve the task class registered for *cfg* and call its ``setup_task``."""
+    task = None
+    legacy_name = getattr(cfg, "task", None)
+
+    if isinstance(legacy_name, str):
+        # legacy tasks
+        task = TASK_REGISTRY[legacy_name]
+        if legacy_name in TASK_DATACLASS_REGISTRY:
+            cfg = TASK_DATACLASS_REGISTRY[legacy_name].from_namespace(cfg)
+    else:
+        hydra_name = getattr(cfg, "_name", None)
+
+        if hydra_name and hydra_name in TASK_DATACLASS_REGISTRY:
+            remove_missing = kwargs.get("from_checkpoint", False)
+            dc = TASK_DATACLASS_REGISTRY[hydra_name]
+            cfg = merge_with_parent(dc(), cfg, remove_missing=remove_missing)
+            task = TASK_REGISTRY[hydra_name]
+
+    assert (
+        task is not None
+    ), f"Could not infer task type from {cfg}. Available argparse tasks: {TASK_REGISTRY.keys()}. Available hydra tasks: {TASK_DATACLASS_REGISTRY.keys()}"
+
+    return task.setup_task(cfg, **kwargs)
+
+
+def register_task(name, dataclass=None):
+    """
+    New tasks can be added to fairseq with the
+    :func:`~fairseq.tasks.register_task` function decorator.
+
+    For example::
+
+        @register_task('classification')
+        class ClassificationTask(FairseqTask):
+            (...)
+
+    .. note:: All Tasks must implement the
+        :class:`~fairseq.tasks.FairseqTask` interface.
+
+    Args:
+        name (str): the name of the task; if it is already registered, the
+            previously registered class is returned unchanged
+        dataclass: optional FairseqDataclass subclass, stored under group "task"
+    """
+
+    def register_task_cls(cls):
+        if name in TASK_REGISTRY:
+            return TASK_REGISTRY[name]  # duplicate name: keep the first registration
+
+        if not issubclass(cls, FairseqTask):
+            raise ValueError(
+                "Task ({}: {}) must extend FairseqTask".format(name, cls.__name__)
+            )
+        if cls.__name__ in TASK_CLASS_NAMES:
+            raise ValueError(
+                "Cannot register task with duplicate class name ({})".format(
+                    cls.__name__
+                )
+            )
+        TASK_REGISTRY[name] = cls  # registered before the dataclass check below
+        TASK_CLASS_NAMES.add(cls.__name__)
+
+        if dataclass is not None and not issubclass(dataclass, FairseqDataclass):
+            raise ValueError(
+                "Dataclass {} must extend FairseqDataclass".format(dataclass)
+            )
+
+        cls.__dataclass = dataclass  # not name-mangled: assigned outside a class body
+        if dataclass is not None:
+            TASK_DATACLASS_REGISTRY[name] = dataclass
+
+            cs = ConfigStore.instance()
+            node = dataclass()  # a default-constructed instance is what gets stored
+            node._name = name
+            cs.store(name=name, group="task", node=node, provider="fairseq")
+
+        return cls
+
+    return register_task_cls
+
+
+def get_task(name):  # returns the class registered under *name*
+    return TASK_REGISTRY[name]  # KeyError if *name* is not in the registry
+
+
+def import_tasks(tasks_dir, namespace):
+    """Import each task module/package in *tasks_dir* under *namespace*."""
+    for entry in os.listdir(tasks_dir):
+        path = os.path.join(tasks_dir, entry)
+        is_module = entry.endswith(".py")
+        if entry.startswith(("_", ".")):
+            continue
+        if not (is_module or os.path.isdir(path)):
+            continue
+        task_name = entry.partition(".py")[0] if is_module else entry
+        importlib.import_module(namespace + "." + task_name)
+
+        # expose `task_parser` for sphinx
+        if task_name not in TASK_REGISTRY:
+            continue
+        parser = argparse.ArgumentParser(add_help=False)
+        group_task = parser.add_argument_group("Task name")
+        # fmt: off
+        group_task.add_argument('--task', metavar=task_name,
+                                help='Enable this task with: ``--task=' + task_name + '``')
+        # fmt: on
+        group_args = parser.add_argument_group(
+            "Additional command-line arguments"
+        )
+        TASK_REGISTRY[task_name].add_args(group_args)
+        globals()[task_name + "_parser"] = parser
+
+
+# automatically import every task module or package in the tasks/ directory
+tasks_dir = os.path.dirname(__file__)
+import_tasks(tasks_dir, "fairseq.tasks")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
new file mode 100644
index 0000000000..77634f1bae
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import os
+import torch
+import json
+
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Optional, Any
+
+from fairseq.data import AddTargetDataset, Dictionary, encoders
+from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.configs import GenerationConfig
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+from . import register_task
+from .. import utils
+from ..logging import metrics
+
+
+logger = logging.getLogger(__name__)
+
+
+class LabelEncoder(object):
+    def __init__(self, dictionary):
+        self.dictionary = dictionary
+
+    def __call__(self, label):
+        return self.dictionary.encode_line(
+            label, append_eos=False, add_if_not_exist=False
+        )
+
+
+def label_len_fn(label):
+    return len(label.split(" "))
+
+
+@dataclass
+class AudioFinetuningConfig(AudioPretrainingConfig):
+    """Options added for fine-tuning: WER/BLEU validation and autoregressive mode."""
+    # WER reporting during validation; only applicable to Seq2Seq models when fine-tuning
+    eval_wer: bool = field(
+        default=False, metadata={"help": "compute WER for Seq2Seq models"}
+    )
+    eval_wer_config: GenerationConfig = field(
+        default_factory=lambda: GenerationConfig(),
+        metadata={"help": "beam search config for evaluating wer during training"},
+    )
+    eval_wer_tokenizer: Any = field(
+        default=None,
+        metadata={"help": "tokenizer config for evaluating wer during training"},
+    )
+    eval_wer_post_process: str = field(
+        default="letter",
+        metadata={
+            "help": "remove BPE tokens before scoring (can be sentencepiece, letter, and more)"
+        },
+    )
+    eval_bleu: bool = field(
+        default=False, metadata={"help": "evaluation with BLEU scores"}
+    )
+    eval_bleu_detok: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "detokenize before computing BLEU (e.g., 'moses'); "
+            "required if using --eval-bleu; use 'space' to disable "
+            "detokenization; see fairseq.data.encoders for other options"
+        },
+    )
+    eval_bleu_detok_args: str = field(
+        default="{}", metadata={"help": "args for building the tokenizer, if needed"}
+    )
+    eval_tokenized_bleu: bool = field(
+        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
+    )
+    eval_bleu_remove_bpe: Optional[str] = field(
+        default=None, metadata={"help": "remove BPE before computing BLEU"}
+    )
+    eval_bleu_args: str = field(
+        default="{}",
+        metadata={
+            "help": "generation args for BLEU scoring, e.g., "
+            '\'{"beam": 4, "lenpen": 0.6}\''
+        },
+    )
+    eval_bleu_print_samples: bool = field(
+        default=False, metadata={"help": "print sample generations during validation"}
+    )
+    autoregressive: bool = field(
+        default=False,
+        metadata={
+            "help": "required for autoregressive decoders (like seq2seq models); "
+            "adds 'prev_output_tokens' to input and appends eos to target"
+        },
+    )
+    rebuild_batches: bool = True
+
+
+@register_task("audio_finetuning", dataclass=AudioFinetuningConfig)
+class AudioFinetuningTask(AudioPretrainingTask):
+    """ """
+
+    cfg: AudioFinetuningConfig
+
+    def __init__(
+        self,
+        cfg: AudioFinetuningConfig,
+    ):
+        super().__init__(cfg)
+        self.blank_symbol = "<s>"
+
+        self.state.add_factory("target_dictionary", self.load_target_dictionary)
+
+    def load_target_dictionary(self):
+        if self.cfg.labels:
+            dict_path = os.path.join(self.cfg.data, f"dict.{self.cfg.labels}.txt")
+            return Dictionary.load(dict_path)
+        return None
+
+    def load_dataset(
+        self, split: str, task_cfg: AudioFinetuningConfig = None, **kwargs
+    ):
+        super().load_dataset(split, task_cfg, **kwargs)
+
+        task_cfg = task_cfg or self.cfg
+        assert task_cfg.labels is not None
+        text_compression_level = getattr(
+            TextCompressionLevel, str(self.cfg.text_compression_level)
+        )
+        data_path = self.cfg.data
+        label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
+        skipped_indices = getattr(self.datasets[split], "skipped_indices", set())
+        text_compressor = TextCompressor(level=text_compression_level)
+        with open(label_path, "r") as f:
+            labels = [
+                text_compressor.compress(l)
+                for i, l in enumerate(f)
+                if i not in skipped_indices
+            ]
+
+        assert len(labels) == len(self.datasets[split]), (
+            f"labels length ({len(labels)}) and dataset length "
+            f"({len(self.datasets[split])}) do not match"
+        )
+
+        process_label = LabelEncoder(self.target_dictionary)
+
+        self.datasets[split] = AddTargetDataset(
+            self.datasets[split],
+            labels,
+            pad=self.target_dictionary.pad(),
+            eos=self.target_dictionary.eos(),
+            batch_targets=True,
+            process_label=process_label,
+            label_len_fn=label_len_fn,
+            add_to_input=task_cfg.get("autoregressive", False),
+            text_compression_level=text_compression_level,
+        )
+
+    @property
+    def target_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` for the language
+        model."""
+        return self.state.target_dictionary
+
+    def valid_step(self, sample, model, criterion):
+        """Run the parent validation step and attach WER/BLEU counts if enabled."""
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        if self.cfg.eval_wer and self.cfg.autoregressive:
+            wer = self._inference_with_wer(self.sequence_generator, sample, model)
+            wer_keys = ("num_char_errors", "num_chars", "num_word_errors", "num_words")
+            for key in wer_keys:
+                logging_output["_" + key] = wer[key]
+        if self.cfg.eval_bleu and self.cfg.autoregressive:
+            bleu = self._inference_with_bleu(self.sequence_generator, sample, model)
+            logging_output["_bleu_sys_len"] = bleu.sys_len
+            logging_output["_bleu_ref_len"] = bleu.ref_len
+            # we split counts into separate entries so that they can be
+            # summed efficiently across workers using fast-stat-sync
+            assert len(bleu.counts) == 4
+            for order in range(4):
+                logging_output[f"_bleu_counts_{order}"] = bleu.counts[order]
+                logging_output[f"_bleu_totals_{order}"] = bleu.totals[order]
+        return loss, sample_size, logging_output
+
+    def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False):
+        model = super().build_model(model_cfg, from_checkpoint)
+
+        if self.cfg.eval_wer and self.cfg.autoregressive:
+            self.sequence_generator = self.build_generator(
+                [model],
+                self.cfg.eval_wer_config,
+            )
+            if self.cfg.eval_wer_tokenizer:
+                self.tokenizer = encoders.build_tokenizer(self.cfg.eval_wer_tokenizer)
+            else:
+                self.tokenizer = None
+        if self.cfg.eval_bleu and self.cfg.autoregressive:
+            assert self.cfg.eval_bleu_detok is not None, (
+                "--eval-bleu-detok is required if using --eval-bleu; "
+                "try --eval-bleu-detok=moses (or --eval-bleu-detok=space "
+                "to disable detokenization, e.g., when using sentencepiece)"
+            )
+            detok_args = json.loads(self.cfg.eval_bleu_detok_args)
+            self.tokenizer = encoders.build_tokenizer(
+                Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args)
+            )
+            gen_args = json.loads(self.cfg.eval_bleu_args)
+            gen_args = Namespace(**gen_args)
+            self.sequence_generator = self.build_generator([model], gen_args)
+
+        return model
+
+    def _inference_with_wer(self, generator, sample, model):
+        """Generate hypotheses for ``sample`` and count character/word edit errors.
+
+        Returns a dict with ``num_char_errors``, ``num_chars``,
+        ``num_word_errors`` and ``num_words`` summed over the batch.
+        """
+        import editdistance
+
+        def decode(toks):
+            text = self.target_dictionary.string(
+                toks.int().cpu(),
+                self.cfg.eval_wer_post_process,
+                escape_unk=True,
+            )
+            return self.tokenizer.decode(text) if self.tokenizer else text
+
+        stats = {
+            "num_char_errors": 0,
+            "num_chars": 0,
+            "num_word_errors": 0,
+            "num_words": 0,
+        }
+        gen_out = self.inference_step(generator, [model], sample, None)
+        for i, hypos in enumerate(gen_out):
+            hyp = decode(hypos[0]["tokens"])
+            target = utils.strip_pad(sample["target"][i], self.target_dictionary.pad())
+            ref = decode(target)
+            hyp_words, ref_words = hyp.split(), ref.split()
+            stats["num_char_errors"] += editdistance.eval(hyp, ref)
+            stats["num_chars"] += len(ref)
+            stats["num_word_errors"] += editdistance.eval(hyp_words, ref_words)
+            stats["num_words"] += len(ref_words)
+
+        return stats
+
+    def _inference_with_bleu(self, generator, sample, model):
+        import sacrebleu
+
+        def decode(toks, is_ref):
+            s = self.target_dictionary.string(
+                toks.int().cpu(),
+                self.cfg.eval_bleu_remove_bpe,
+                # The default unknown string in fairseq is `<unk>`, but
+                # this is tokenized by sacrebleu as `< unk >`, inflating
+                # BLEU scores. Instead, we use a somewhat more verbose
+                # alternative that is unlikely to appear in the real
+                # reference, but doesn't get split into multiple tokens.
+                unk_string=("UNKNOWNTOKENINREF" if is_ref else "UNKNOWNTOKENINHYP"),
+            )
+            if self.tokenizer:
+                s = self.tokenizer.decode(s)
+            return s
+
+        gen_out = self.inference_step(generator, [model], sample)
+        hyps, refs = [], []
+        for i in range(len(gen_out)):
+            hyps.append(decode(gen_out[i][0]["tokens"], is_ref=False))
+            refs.append(
+                decode(
+                    utils.strip_pad(sample["target"][i], self.target_dictionary.pad()),
+                    is_ref=True,  # don't count <unk> as matches to the hypo
+                )
+            )
+        if self.cfg.eval_bleu_print_samples:
+            logger.info("H-{} {}".format(sample["id"][0], hyps[0]))
+            logger.info("T-{} {}".format(sample["id"][0], refs[0]))
+
+        eval_tokenization = "none" if self.cfg.eval_tokenized_bleu else "13a"
+        return sacrebleu.corpus_bleu(hyps, [refs], tokenize=eval_tokenization)
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Aggregate validation logs and register derived UER/WER/BLEU metrics.
+
+        Args:
+            logging_outputs: iterable of dicts carrying the ``_num_*`` and
+                ``_bleu_*`` entries (missing entries count as zero)
+            criterion: passed through to the parent implementation
+        """
+        super().reduce_metrics(logging_outputs, criterion)
+
+        if self.cfg.eval_wer:
+            zero = torch.scalar_tensor(0.0)
+            totals = {}
+            for kind in ("char_errors", "chars", "word_errors", "words"):
+                key = f"_num_{kind}"
+                totals[key] = sum(log.get(key, zero) for log in logging_outputs)
+                metrics.log_scalar(key, totals[key])
+
+            def error_rate(errors_key, count_key):
+                # percentage of errors, or NaN while nothing has been counted
+                def compute(meters):
+                    count = meters[count_key].sum
+                    if count > 0:
+                        return meters[errors_key].sum * 100.0 / count
+                    return float("nan")
+                return compute
+
+            if totals["_num_chars"] > 0:
+                metrics.log_derived("uer", error_rate("_num_char_errors", "_num_chars"))
+            if totals["_num_words"] > 0:
+                metrics.log_derived("wer", error_rate("_num_word_errors", "_num_words"))
+        if self.cfg.eval_bleu:
+            len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
+            count_keys = [f"_bleu_counts_{i}" for i in range(4)]
+            total_keys = [f"_bleu_totals_{i}" for i in range(4)]
+            for k in len_keys + count_keys + total_keys:
+                metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs))
+
+            def compute_bleu(meters):
+                # newer sacrebleu releases expose compute_bleu on the BLEU class;
+                # fall back to the module-level function of sacrebleu 1.x
+                try:
+                    from sacrebleu.metrics import BLEU as bleu_api
+                except ImportError:
+                    import sacrebleu as bleu_api
+                return bleu_api.compute_bleu(
+                    correct=[meters[k].sum for k in count_keys],
+                    total=[meters[k].sum for k in total_keys],
+                    sys_len=meters["_bleu_sys_len"].sum,
+                    ref_len=meters["_bleu_ref_len"].sum,
+                    smooth_method="exp",
+                ).score
+
+            metrics.log_derived("bleu", compute_bleu)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
new file mode 100644
index 0000000000..e6de0666ad
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import os
+import sys
+
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Optional
+from omegaconf import MISSING, II
+
+from fairseq.data import BinarizedAudioDataset, FileAudioDataset, SubsampleDataset
+from fairseq.dataclass import FairseqDataclass, ChoiceEnum
+from fairseq.data.text_compressor import TextCompressionLevel
+
+from . import FairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AudioMaskingConfig:
+    feature_encoder_spec: str = II("model.modalities.audio.feature_encoder_spec")
+    mask_prob: float = II("model.modalities.audio.mask_prob")
+    mask_prob_adjust: float = II("model.modalities.audio.mask_prob_adjust")
+    mask_length: int = II("model.modalities.audio.mask_length")
+    inverse_mask: bool = II("model.modalities.audio.inverse_mask")
+    mask_dropout: float = II("model.modalities.audio.mask_dropout")
+    clone_batch: int = II("model.clone_batch")
+    expand_adjacent: bool = False
+    non_overlapping: bool = False
+
+
+@dataclass
+class AudioPretrainingConfig(FairseqDataclass):
+    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
+    labels: Optional[str] = field(
+        default=None,
+        metadata={"help": "extension of the label file to load, used for fine-tuning"},
+    )
+    binarized_dataset: bool = field(
+        default=False,
+        metadata={
+            "help": "if true, loads binarized dataset (useful for very large datasets). "
+            "See examples/wav2vec/scripts/binarize_manifest.sh"
+        },
+    )
+    sample_rate: int = field(
+        default=16_000,
+        metadata={
+            "help": "target sample rate. audio files will be up/down sampled to this rate"
+        },
+    )
+    normalize: bool = field(
+        default=False,
+        metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
+    )
+    enable_padding: bool = field(
+        default=False, metadata={"help": "pad shorter samples instead of cropping"}
+    )
+    max_sample_size: Optional[int] = field(
+        default=None, metadata={"help": "max sample size to crop to for batching"}
+    )
+    min_sample_size: Optional[int] = field(
+        default=None, metadata={"help": "min sample size to skip small examples"}
+    )
+    num_batch_buckets: int = field(
+        default=0,
+        metadata={"help": "number of buckets"},
+    )
+    tpu: bool = II("common.tpu")
+    text_compression_level: ChoiceEnum([x.name for x in TextCompressionLevel]) = field(
+        default="none",
+        metadata={
+            "help": "compression level for texts (e.g. audio filenames, "
+            "target texts): none/low/high (default: none). "
+        },
+    )
+
+    rebuild_batches: bool = True
+    precompute_mask_config: Optional[AudioMaskingConfig] = None
+
+    post_save_script: Optional[str] = None
+
+    subsample: float = 1
+    seed: int = II("common.seed")
+
+
+@register_task("audio_pretraining", dataclass=AudioPretrainingConfig)
+class AudioPretrainingTask(FairseqTask):
+    """ """
+
+    cfg: AudioPretrainingConfig
+
+    @classmethod
+    def setup_task(cls, cfg: AudioPretrainingConfig, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            cfg (AudioPretrainingConfig): configuration of this task
+        """
+
+        return cls(cfg)
+
+    def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs):
+        data_path = self.cfg.data
+        task_cfg = task_cfg or self.cfg
+
+        # upgrade old task
+        if isinstance(task_cfg, Namespace):
+            if not hasattr(task_cfg, "autoregressive"):
+                task_cfg.autoregressive = not task_cfg.criterion == "ctc"
+
+        text_compression_level = getattr(
+            TextCompressionLevel, str(self.cfg.text_compression_level)
+        )
+
+        compute_mask = task_cfg.precompute_mask_config is not None
+        mask_args = {}
+        if compute_mask:
+            mask_args = task_cfg.precompute_mask_config
+
+        if getattr(task_cfg, "binarized_dataset", False):
+            self.datasets[split] = BinarizedAudioDataset(
+                data_path,
+                split=split,
+                sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate),
+                max_sample_size=self.cfg.max_sample_size,
+                min_sample_size=self.cfg.min_sample_size,
+                pad=task_cfg.labels is not None or task_cfg.enable_padding,
+                normalize=task_cfg.normalize,
+                num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu),
+                compute_mask=compute_mask,
+                **mask_args,
+            )
+        else:
+            manifest_path = os.path.join(data_path, "{}.tsv".format(split))
+
+            self.datasets[split] = FileAudioDataset(
+                manifest_path=manifest_path,
+                sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate),
+                max_sample_size=self.cfg.max_sample_size,
+                min_sample_size=self.cfg.min_sample_size,
+                pad=task_cfg.labels is not None or task_cfg.enable_padding,
+                normalize=task_cfg.normalize,
+                num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu),
+                text_compression_level=text_compression_level,
+                compute_mask=compute_mask,
+                **mask_args,
+            )
+
+        if getattr(task_cfg, "subsample", 1) < 1:
+            self.datasets[split] = SubsampleDataset(
+                self.datasets[split],
+                task_cfg.subsample,
+                shuffle=True,
+                seed=task_cfg.seed,
+            )
+
+        if self.cfg.tpu and task_cfg.inferred_w2v_config.mask_channel_prob == 0.0:
+            logger.info(
+                "Pretraining on TPUs may suffer convergence "
+                "issues when training with `mask_channel_prob` value of "
+                "0. You may want to set this to a low value close to 0."
+            )
+
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return sys.maxsize, sys.maxsize
+
+    def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False):
+        """Build the model, copying its ``cfg.w2v_args`` (if any) onto ``model_cfg``."""
+        model = super().build_model(model_cfg, from_checkpoint)
+
+        built_cfg = getattr(model, "cfg", None)
+        has_w2v_args = built_cfg is not None and hasattr(built_cfg, "w2v_args")
+        if has_w2v_args:
+            model_cfg.w2v_args = built_cfg.w2v_args
+
+        return model
+
+    def post_save(self, cp_path, num_updates):
+        """Copy the checkpoint and run ``cfg.post_save_script`` on the copy, if set."""
+        script = self.cfg.post_save_script
+        if script is None:
+            return
+
+        import shlex
+        import subprocess
+        from fairseq.file_io import PathManager
+
+        logger.info(f"launching {script}")
+        eval_cp_path = os.path.join(
+            os.path.dirname(cp_path), f"checkpoint_eval_{num_updates}.pt"
+        )
+        logger.debug("copying %s to %s", cp_path, eval_cp_path)
+        # Copy outside of ``assert`` so it is not stripped under ``python -O``.
+        if not PathManager.copy(cp_path, eval_cp_path, overwrite=True):
+            raise RuntimeError(f"Failed to copy {cp_path} to {eval_cp_path}")
+        # Append the path as its own argv entry so spaces in it cannot split it.
+        subprocess.call(shlex.split(script) + [eval_cp_path])
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
new file mode 100644
index 0000000000..8f8fe7e2de
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
@@ -0,0 +1,191 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import os
+from collections import OrderedDict
+
+import numpy as np
+from fairseq import tokenizer, utils
+from fairseq.data import ConcatDataset, Dictionary, TokenBlockDataset, data_utils
+from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("cross_lingual_lm")
+class CrossLingualLMTask(LegacyFairseqTask):
+    """
+    Task for training cross-lingual language models.
+
+    For more details look at: https://arxiv.org/pdf/1901.07291.pdf
+
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument(
+            "data",
+            help="colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner",
+        )
+        parser.add_argument(
+            "--tokens-per-sample",
+            default=512,
+            type=int,
+            help="max number of total tokens over all segments" " per sample",
+        )
+        parser.add_argument(
+            "--monolingual-langs",
+            default="en",
+            type=str,
+            help="comma separated list of languages for which we"
+            " want to train XLM on",
+        )
+        parser.add_argument(
+            "--shuffle",
+            action="store_true",
+            help="shuffle each monolingual dataset while" " training",
+        )
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+        self.distributed_world_size = args.distributed_world_size
+        self.langs2id = self._lang_to_id(args.monolingual_langs)
+
+    def _lang_to_id(self, languages: str):
+        """
+        Map each comma-separated language name to its position in the list.
+
+        Names are stripped of surrounding whitespace; the positions serve as
+        segment labels for cross-lingual LM training.
+        """
+        names = (name.strip() for name in languages.split(","))
+        # a repeated name keeps its last position
+        return {name: index for index, name in enumerate(names)}
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        return MaskedLMDictionary.load(filename)
+
+    @classmethod
+    def build_dictionary(
+        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
+    ):
+        d = MaskedLMDictionary()
+        for filename in filenames:
+            Dictionary.add_file_to_dictionary(
+                filename, d, tokenizer.tokenize_line, workers
+            )
+        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return d
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task."""
+        dictionary = MaskedLMDictionary.load(os.path.join(args.data, "dict.txt"))
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        return cls(args, dictionary)
+
+    def _load_single_lang_dataset(self, split, epoch):
+        loaded_datasets = []
+
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        for k in itertools.count():
+            split_k = split + (str(k) if k > 0 else "")
+            path = os.path.join(data_path, split_k)
+
+            ds = data_utils.load_indexed_dataset(
+                path, self.dictionary, self.args.dataset_impl
+            )
+            if ds is None:
+                if k > 0:
+                    break
+                else:
+                    raise FileNotFoundError(
+                        "Dataset not found: {} ({})".format(split, data_path)
+                    )
+
+            # Since we append each block with the classification_token,
+            # we need to effectively create blocks of length
+            # tokens_per_sample-1
+            loaded_datasets.append(
+                TokenBlockDataset(
+                    ds,
+                    ds.sizes,
+                    self.args.tokens_per_sample - 1,
+                    pad=self.dictionary.pad(),
+                    eos=self.dictionary.eos(),
+                )
+            )
+
+            logger.info(
+                "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1]))
+            )
+
+        if len(loaded_datasets) == 1:
+            dataset = loaded_datasets[0]
+            sizes = dataset.sizes
+        else:
+            dataset = ConcatDataset(loaded_datasets)
+            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
+
+        return dataset, sizes
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): 1-based epoch; selects the data path round-robin
+            combine (bool): unused by this task
+        """
+        dataset_map = OrderedDict()
+
+        for lang in self.langs2id.keys():
+            # Datasets are expected to be in "split.lang" format (Eg: train.en)
+            language_split = "{}.{}".format(split, lang)
+
+            block_dataset, sizes = self._load_single_lang_dataset(
+                split=language_split, epoch=epoch
+            )
+
+            dataset_map[lang] = MaskedLMDataset(
+                dataset=block_dataset,
+                sizes=sizes,
+                vocab=self.dictionary,
+                pad_idx=self.dictionary.pad(),
+                mask_idx=self.dictionary.mask(),
+                classif_token_idx=self.dictionary.eos(),
+                sep_token_idx=self.dictionary.eos(),
+                shuffle=getattr(self.args, "shuffle", False),
+                has_pairs=False,
+                segment_id=self.langs2id[lang],
+                seed=self.seed,
+            )
+
+        self.datasets[split] = MultiCorpusSampledDataset(dataset_map)
+
+        # Wrap around like _load_single_lang_dataset so epoch > len(paths) is valid.
+        paths = utils.split_paths(self.args.data)
+        data_path = paths[(epoch - 1) % len(paths)]
+        logger.info(f"{data_path} {split} {len(self.datasets[split])} examples")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
new file mode 100644
index 0000000000..57b824d581
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
@@ -0,0 +1,296 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+import numpy as np
+from omegaconf import II, MISSING
+
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    DenoisingDataset,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    data_utils,
+)
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+
+from ..data.indexed_dataset import get_available_dataset_impl
+
+logger = logging.getLogger(__name__)
+# Value sets of the sample_break_mode, shorten_method and mask_length options.
+SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+MASK_LENGTH_CHOICES = ChoiceEnum(["subword", "word", "span-poisson"])
+
+
+@dataclass
+class DenoisingConfig(FairseqDataclass):
+    """Configuration of the denoising task and of the noise applied to its inputs."""
+    data: str = field(
+        default=MISSING,
+        metadata={"help": "path to data directory"},
+    )
+    bpe: Optional[str] = field(
+        default=None,
+        metadata={"help": "BPE scheme of the data, used to compute whole-word masks"},
+    )
+    tokens_per_sample: int = field(
+        default=512,
+        metadata={
+            "help": "max number of total tokens over all segments "
+            "per sample for dataset"
+        },
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="complete_doc",
+        metadata={
+            "help": 'If set to "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    replace_length: int = field(
+        default=0,
+        metadata={"help": "replace N masked tokens with 0, 1 or N tokens (-1 means N)"},
+    )
+    mask: float = field(
+        default=0.0,
+        metadata={"help": "fraction of words/subwords that will be masked"},
+    )
+    mask_random: float = field(
+        default=0.0,
+        metadata={"help": "instead of using [MASK], use random token this often"},
+    )
+    insert: float = field(
+        default=0.0,
+        metadata={"help": "insert this percentage of additional random tokens"},
+    )
+    permute: float = field(
+        default=0.0,
+        metadata={"help": "take this proportion of subwords and permute them"},
+    )
+    rotate: float = field(
+        default=0.5,
+        metadata={"help": "rotate this proportion of inputs"},
+    )
+    poisson_lambda: float = field(
+        default=3.0,
+        metadata={"help": "Poisson lambda of the span lengths for mask_length=span-poisson"},
+    )
+    shuffle_instance: float = field(
+        default=0.0,
+        metadata={"help": "if non-zero, shuffle the order of the dataset instances"},
+    )
+    mask_length: MASK_LENGTH_CHOICES = field(
+        default="subword",
+        metadata={"help": "mask length to choose"},
+    )
+    # NOTE(review): a proportion, yet typed int with default -1 -- confirm the type.
+    permute_sentences: int = field(
+        default=-1,
+        metadata={"help": "shuffle this proportion of sentences in all inputs"},
+    )
+    seed: int = II("common.seed")
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    max_source_positions: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens in the source sequence"},
+    )
+    max_target_positions: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens in the target sequence"},
+    )
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+
+
+@register_task("denoising", dataclass=DenoisingConfig)
+class DenoisingTask(FairseqTask):
+    """
+    Task for sequence-to-sequence denoising (e.g. BART), built on DenoisingDataset.
+    """
+
+    cfg: DenoisingConfig
+
+    def __init__(self, cfg, dictionary):
+        super().__init__(cfg)
+        self.dictionary = dictionary
+
+        # add the mask token; its index is passed to DenoisingDataset in load_dataset
+        self.mask_idx = self.dictionary.add_symbol("<mask>")
+
+    @classmethod
+    def setup_task(cls, cfg: DenoisingConfig, **kwargs):
+        """Build the task from the ``dict.txt`` found in the first data path."""
+        paths = utils.split_paths(cfg.data)
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        if not hasattr(cfg, "shuffle_instance"):
+            cfg.shuffle_instance = False
+        return cls(cfg, dictionary)
+
+    def _load_dataset_split(self, split, epoch, combine):
+        """Load *split* as token blocks, each wrapped in bos ... eos."""
+        paths = utils.split_paths(self.cfg.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.dictionary,
+            self.cfg.dataset_impl,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError(
+                "Dataset not found: {} ({})".format(split, split_path)
+            )
+
+        dataset = StripTokenDataset(dataset, self.dictionary.eos())
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.cfg.shorten_data_split_list,
+            self.cfg.shorten_method,
+            self.cfg.tokens_per_sample,
+            self.cfg.seed,
+        )
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.cfg.tokens_per_sample - 2,
+            # two positions are kept free for the <s> and </s> added below
+            pad=self.dictionary.pad(),
+            eos=self.dictionary.eos(),
+            break_mode=self.cfg.sample_break_mode,
+            document_sep_len=0,
+        )
+        logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))
+
+        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())
+        return dataset
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split into ``self.datasets`` as a DenoisingDataset.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        dataset = self._load_dataset_split(split, epoch, combine)
+
+        mask_whole_words = (
+            get_whole_word_mask(self.cfg.bpe, self.source_dictionary)
+            if self.cfg.mask_length != "subword"
+            else None
+        )
+
+        self.datasets[split] = DenoisingDataset(
+            dataset,
+            dataset.sizes,
+            self.dictionary,
+            self.mask_idx,
+            mask_whole_words,
+            shuffle=self.cfg.shuffle_instance,
+            seed=self.cfg.seed,
+            mask=self.cfg.mask,
+            mask_random=self.cfg.mask_random,
+            insert=self.cfg.insert,
+            rotate=self.cfg.rotate,
+            permute_sentences=self.cfg.permute_sentences,
+            bpe=self.cfg.bpe,
+            replace_length=self.cfg.replace_length,
+            mask_length=self.cfg.mask_length,
+            poisson_lambda=self.cfg.poisson_lambda,
+        )
+        logger.info(
+            "Split: {0}, Loaded {1} samples of denoising_dataset".format(
+                split,
+                len(self.datasets[split]),
+            )
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
+        """
+        Generate batches for inference. We assume that the input begins with a
+        bos symbol (`<s>`) and ends with an eos symbol (`</s>`).
+        """
+        pad = self.source_dictionary.pad()
+        eos = self.source_dictionary.eos()
+        src_dataset = TokenBlockDataset(
+            src_tokens,
+            src_lengths,
+            block_size=self.cfg.tokens_per_sample - 2,  # for <s> and </s>
+            pad=pad,
+            eos=eos,
+            break_mode=self.cfg.sample_break_mode,
+            document_sep_len=0,
+        )
+        prev_output_tokens = PrependTokenDataset(
+            StripTokenDataset(src_dataset, eos), eos
+        )  # eos is stripped from the source blocks, then prepended
+        src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False)
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": src_dataset,
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                    "prev_output_tokens": PadDataset(
+                        prev_output_tokens, pad_idx=pad, left_pad=False
+                    ),
+                },
+                "target": src_dataset,
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    def max_positions(self):
+        """Return the max sentence length allowed by the task."""
+        return (self.cfg.max_source_positions, self.cfg.max_target_positions)
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
new file mode 100644
index 0000000000..e39d1d6848
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
@@ -0,0 +1,708 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import warnings
+from argparse import Namespace
+from typing import Any, Callable, Dict, List
+
+import torch
+from fairseq import search, tokenizer, utils
+from fairseq.logging import metrics
+from fairseq.data import Dictionary, FairseqDataset, data_utils, encoders, iterators
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.optim.amp_optimizer import AMPOptimizer
+from omegaconf import DictConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class StatefulContainer(object):  # lazily built task state, exposed as attributes
+    def __init__(self):
+        self._state = dict()  # attribute name -> materialised value
+        self._factories = dict()  # attribute name -> zero-argument builder
+
+    def add_factory(self, name, factory: Callable[[], Any]):
+        self._factories[name] = factory  # called on the first access of *name*
+
+    def merge_state_dict(self, state_dict: Dict[str, Any]):
+        self._state.update(state_dict)  # merged values take precedence over factories
+
+    @property
+    def state_dict(self) -> Dict[str, Any]:
+        return self._state
+
+    def __getattr__(self, name):
+        # Read via __dict__: copy/pickle probe attributes before _state exists.
+        state = self.__dict__.get("_state", {})
+        if name not in state and name in self.__dict__.get("_factories", {}):
+            state[name] = self._factories[name]()
+        if name in state:
+            return state[name]
+        raise AttributeError(f"Task state has no factory for attribute {name}")
+
+
+class FairseqTask(object):
+    """
+    Tasks store dictionaries and provide helpers for loading/iterating over
+    Datasets, initializing the Model/Criterion and calculating the loss.
+
+    Tasks have limited statefulness. In particular, state that needs to be
+    saved to/loaded from checkpoints needs to be stored in the `self.state`
+    :class:`StatefulContainer` object. For example::
+
+        self.state.add_factory("dictionary", self.load_dictionary)
+        print(self.state.dictionary)  # calls self.load_dictionary()
+
+    This is necessary so that when loading checkpoints, we can properly
+    recreate the task state after initializing the task instance.
+    """
+
+    @classmethod
+    def add_args(cls, parser):
+        """Add task-specific arguments, generated from the task dataclass, to the parser."""
+        dc = getattr(cls, "__dataclass", None)
+        if dc is not None:  # no dataclass attached to the class: nothing to add
+            gen_parser_from_dataclass(parser, dc())
+
+    @staticmethod
+    def logging_outputs_can_be_summed(criterion) -> bool:
+        """
+        Whether the logging outputs returned by `train_step` and `valid_step` can
+        be summed across workers prior to calling `aggregate_logging_outputs`.
+        Setting this to True improves distributed training speed. Defers to *criterion*.
+        """
+        return criterion.logging_outputs_can_be_summed()
+
+    def __init__(self, cfg: FairseqDataclass, **kwargs):
+        self.cfg = cfg
+        self.datasets = dict()  # split name -> loaded dataset, read by dataset()
+        self.dataset_to_epoch_iter = dict()  # iterator cache of get_batch_iterator()
+        self.state = StatefulContainer()  # state saved to/loaded from checkpoints
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load the dictionary from the filename (delegates to ``Dictionary.load``).
+
+        Args:
+            filename (str): the filename
+        """
+        return Dictionary.load(filename)
+
+    @classmethod
+    def build_dictionary(
+        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
+    ):
+        """Build the dictionary from *filenames* and finalize it.
+
+        Args:
+            filenames (list): list of filenames
+            workers (int): number of concurrent workers
+            threshold (int): defines the minimum word count
+            nwords (int): defines the total number of words in the final dictionary,
+                including special symbols
+            padding_factor (int): can be used to pad the dictionary size to be a
+                multiple of 8, which is important on some hardware (e.g., Nvidia
+                Tensor Cores).
+        """
+        d = Dictionary()  # a single dictionary is shared by all the files
+        for filename in filenames:
+            Dictionary.add_file_to_dictionary(
+                filename, d, tokenizer.tokenize_line, workers
+            )
+        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return d
+
+    @classmethod
+    def setup_task(cls, cfg: DictConfig, **kwargs):
+        """Setup the task (e.g., load dictionaries); this base version only builds *cls*.
+
+        Args:
+            cfg (omegaconf.DictConfig): parsed command-line arguments
+        """
+        return cls(cfg, **kwargs)
+
+    def has_sharded_data(self, split):
+        return os.pathsep in getattr(self.cfg, "data", "")  # several paths in cfg.data; *split* is unused
+
+    def load_dataset(
+        self,
+        split: str,
+        combine: bool = False,
+        task_cfg: FairseqDataclass = None,
+        **kwargs,
+    ):
+        """Load a given dataset split (not implemented in this base class).
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            combine (bool): combines a split segmented into pieces into one dataset
+            task_cfg (FairseqDataclass): optional task configuration stored in the checkpoint that can be used
+                                         to load datasets
+        """
+        raise NotImplementedError
+
+    def dataset(self, split):
+        """Return a loaded dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+
+        Returns:
+            a :class:`~fairseq.data.FairseqDataset` corresponding to *split*
+        Raises:
+            KeyError if *split* is not loaded; TypeError if it is not a FairseqDataset
+        """
+        from fairseq.data import FairseqDataset
+        if split not in self.datasets:
+            raise KeyError("Dataset not loaded: " + split)
+        if not isinstance(self.datasets[split], FairseqDataset):
+            raise TypeError("Datasets are expected to be of type FairseqDataset")
+        return self.datasets[split]
+
+    def filter_indices_by_size(
+        self, indices, dataset, max_positions=None, ignore_invalid_inputs=False
+    ):
+        """
+        Filter examples that are too large, raising or warning about the rejects.
+
+        Args:
+            indices (np.array): original array of sample indices
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_positions (optional): max sentence length supported by the
+                model (default: None).
+            ignore_invalid_inputs (bool, optional): don't raise Exception for
+                sentences that are too long (default: False).
+        Returns:
+            np.array: array of filtered sample indices
+        """
+        indices, ignored = dataset.filter_indices_by_size(indices, max_positions)
+        if len(ignored) > 0:
+            if not ignore_invalid_inputs:
+                raise Exception(
+                    (
+                        "Size of sample #{} is invalid (={}) since max_positions={}, "
+                        "skip this example with --skip-invalid-size-inputs-valid-test"
+                    ).format(ignored[0], dataset.size(ignored[0]), max_positions)
+                )
+            logger.warning(
+                (
+                    "{:,} samples have invalid sizes and will be skipped, "
+                    "max_positions={}, first few sample ids={}"
+                ).format(len(ignored), max_positions, ignored[:10])
+            )
+        return indices
+
+    def can_reuse_epoch_itr(self, dataset):
+        # We can reuse the epoch iterator across epochs as long as the dataset
+        # hasn't disabled it via ``can_reuse_epoch_itr_across_epochs``. We default
+        # to ``False`` here, although in practice this will be ``True`` for most
+        # datasets that inherit from ``FairseqDataset`` (base implementation there).
+        return getattr(dataset, "can_reuse_epoch_itr_across_epochs", False)
+
+    def get_batch_iterator(
+        self,
+        dataset,
+        max_tokens=None,
+        max_sentences=None,
+        max_positions=None,
+        ignore_invalid_inputs=False,
+        required_batch_size_multiple=1,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=1,
+        data_buffer_size=0,
+        disable_iterator_cache=False,
+        skip_remainder_batch=False,
+        grouped_shuffling=False,
+        update_epoch_batch_itr=False,
+    ):
+        """
+        Get an iterator that yields batches of data from the given dataset.
+
+        Args:
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_tokens (int, optional): max number of tokens in each batch
+                (default: None).
+            max_sentences (int, optional): max number of sentences in each
+                batch (default: None).
+            max_positions (optional): max sentence length supported by the
+                model (default: None).
+            ignore_invalid_inputs (bool, optional): don't raise Exception for
+                sentences that are too long (default: False).
+            required_batch_size_multiple (int, optional): require batch size to
+                be a multiple of N (default: 1).
+            seed (int, optional): seed for random number generator for
+                reproducibility (default: 1).
+            num_shards (int, optional): shard the data iterator into N
+                shards (default: 1).
+            shard_id (int, optional): which shard of the data iterator to
+                return (default: 0).
+            num_workers (int, optional): how many subprocesses to use for data
+                loading. 0 means the data will be loaded in the main process
+                (default: 0).
+            epoch (int, optional): the epoch to start the iterator from
+                (default: 1).
+            data_buffer_size (int, optional): number of batches to
+                preload (default: 0).
+            disable_iterator_cache (bool, optional): don't cache the
+                EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`)
+                (default: False).
+            skip_remainder_batch (bool, optional): if set, discard the last
+                batch in each training epoch, as the last batch is often smaller than
+                    local_batch_size * distributed_world_size (default: ``False``).
+            grouped_shuffling (bool, optional): group batches with each groups
+                containing num_shards batches and shuffle groups. Reduces difference
+                between sequence lengths among workers for batches sorted by length.
+            update_epoch_batch_itr (bool, optional): if true then do not use the cached
+                batch iterator for the epoch (default: False).
+
+        Returns:
+            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
+                given dataset split
+        """
+        can_reuse_epoch_itr = (
+            not disable_iterator_cache
+            and not update_epoch_batch_itr
+            and self.can_reuse_epoch_itr(dataset)
+        )
+        logger.info(f"can_reuse_epoch_itr = {can_reuse_epoch_itr}")
+        if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter:
+            logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch))
+            return self.dataset_to_epoch_iter[dataset]
+
+        assert isinstance(dataset, FairseqDataset)
+
+        # initialize the dataset with the correct starting epoch
+        dataset.set_epoch(epoch)
+
+        def make_batches(dataset, epoch):
+            logger.info(f"creating new batches for epoch {epoch}")
+
+            # get indices ordered by example size
+            with data_utils.numpy_seed(seed + epoch):
+                indices = dataset.ordered_indices()
+
+            # filter examples that are too large
+            if max_positions is not None:
+                indices = self.filter_indices_by_size(
+                    indices, dataset, max_positions, ignore_invalid_inputs
+                )
+
+            # create mini-batches with given size constraints
+            batches = dataset.batch_by_size(
+                indices,
+                max_tokens=max_tokens,
+                max_sentences=max_sentences,
+                required_batch_size_multiple=required_batch_size_multiple,
+            )
+            return batches
+
+        reuse_dataloader = getattr(self.cfg, "reuse_dataloader", True)
+        persistent_workers = getattr(self.cfg, "persistent_workers", True)
+        rebuild_batches = getattr(self.cfg, "rebuild_batches", False)
+        logger.info(f"reuse_dataloader = {reuse_dataloader}")
+        logger.info(f"rebuild_batches = {rebuild_batches}")
+
+        if rebuild_batches:
+            logger.info("batches will be rebuilt for each epoch")
+            batch_sampler = make_batches  # pass the callable itself, not its result
+        else:
+            batch_sampler = make_batches(dataset, epoch)
+
+        # return a reusable, sharded iterator
+        epoch_iter = iterators.EpochBatchIterator(
+            dataset=dataset,
+            collate_fn=dataset.collater,
+            batch_sampler=batch_sampler,
+            seed=seed,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            epoch=epoch,
+            buffer_size=data_buffer_size,
+            skip_remainder_batch=skip_remainder_batch,
+            grouped_shuffling=grouped_shuffling,
+            reuse_dataloader=reuse_dataloader,
+            persistent_workers=persistent_workers,
+        )
+
+        if can_reuse_epoch_itr:
+            self.dataset_to_epoch_iter[dataset] = epoch_iter
+
+        return epoch_iter
+
+    def build_model(self, cfg: FairseqDataclass, from_checkpoint=False):
+        """
+        Build the :class:`~fairseq.models.BaseFairseqModel` instance for this task.
+
+        Args:
+            cfg (FairseqDataclass): configuration object
+            from_checkpoint (bool): forwarded to ``models.build_model``
+
+        Returns:
+            a :class:`~fairseq.models.BaseFairseqModel` instance
+        """
+        from fairseq import models, quantization_utils
+
+        model = models.build_model(cfg, self, from_checkpoint)
+        model = quantization_utils.quantize_model_scalar(model, cfg)
+        return model
+
+    def build_criterion(self, cfg: DictConfig, from_checkpoint=False):
+        """
+        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for this task.
+
+        Args:
+            cfg (omegaconf.DictConfig): configuration object
+            from_checkpoint (bool): forwarded to ``criterions.build_criterion``
+
+        Returns:
+            a :class:`~fairseq.criterions.FairseqCriterion` instance
+        """
+        from fairseq import criterions
+
+        return criterions.build_criterion(cfg, self, from_checkpoint=from_checkpoint)
+
+    def build_generator(
+        self,
+        models,
+        args,
+        seq_gen_cls=None,
+        extra_gen_cls_kwargs=None,
+        prefix_allowed_tokens_fn=None,
+    ):
+        """
+        Build a :class:`~fairseq.SequenceGenerator` instance for this task.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models
+            args (fairseq.dataclass.configs.GenerationConfig):
+                configuration object (dataclass) for generation
+            seq_gen_cls (optional): generator class to use; picked from *args* if None
+            extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
+                through to SequenceGenerator
+            prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
+                If provided, this function constrains the beam search to
+                allowed tokens only at each step. The provided function
+                should take 2 arguments: the batch ID (`batch_id: int`)
+                and a unidimensional tensor of token ids (`inputs_ids:
+                torch.Tensor`). It has to return a `List[int]` with the
+                allowed tokens for the next generation step conditioned
+                on the previously generated tokens (`inputs_ids`) and
+                the batch ID (`batch_id`). This argument is useful for
+                constrained generation conditioned on the prefix, as
+                described in "Autoregressive Entity Retrieval"
+                (https://arxiv.org/abs/2010.00904) and
+                https://github.com/facebookresearch/GENRE.
+        """
+        if getattr(args, "score_reference", False):
+            from fairseq.sequence_scorer import SequenceScorer
+
+            return SequenceScorer(
+                self.target_dictionary,
+                compute_alignment=getattr(args, "print_alignment", False),
+            )
+
+        from fairseq.sequence_generator import (
+            SequenceGenerator,
+            SequenceGeneratorWithAlignment,
+        )
+
+        # Choose search strategy. Defaults to Beam Search.
+        sampling = getattr(args, "sampling", False)
+        sampling_topk = getattr(args, "sampling_topk", -1)
+        sampling_topp = getattr(args, "sampling_topp", -1.0)
+        diverse_beam_groups = getattr(args, "diverse_beam_groups", -1)
+        diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5)
+        match_source_len = getattr(args, "match_source_len", False)
+        diversity_rate = getattr(args, "diversity_rate", -1)
+        constrained = getattr(args, "constraints", False)
+        if prefix_allowed_tokens_fn is None:
+            prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None)
+        if (
+            sum(
+                int(cond)
+                for cond in [
+                    sampling,
+                    diverse_beam_groups > 0,
+                    match_source_len,
+                    diversity_rate > 0,
+                ]
+            )
+            > 1
+        ):
+            raise ValueError("Provided Search parameters are mutually exclusive.")
+        assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
+        assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"
+
+        if sampling:
+            search_strategy = search.Sampling(
+                self.target_dictionary, sampling_topk, sampling_topp
+            )
+        elif diverse_beam_groups > 0:
+            search_strategy = search.DiverseBeamSearch(
+                self.target_dictionary, diverse_beam_groups, diverse_beam_strength
+            )
+        elif match_source_len:
+            # this is useful for tagging applications where the output
+            # length should match the input length, so we hardcode the
+            # length constraints for simplicity
+            search_strategy = search.LengthConstrainedBeamSearch(
+                self.target_dictionary,
+                min_len_a=1,
+                min_len_b=0,
+                max_len_a=1,
+                max_len_b=0,
+            )
+        elif diversity_rate > -1:  # NOTE(review): the check above uses "> 0" -- confirm
+            search_strategy = search.DiverseSiblingsSearch(
+                self.target_dictionary, diversity_rate
+            )
+        elif constrained:
+            search_strategy = search.LexicallyConstrainedBeamSearch(
+                self.target_dictionary, args.constraints
+            )
+        elif prefix_allowed_tokens_fn:
+            search_strategy = search.PrefixConstrainedBeamSearch(
+                self.target_dictionary, prefix_allowed_tokens_fn
+            )
+        else:
+            search_strategy = search.BeamSearch(self.target_dictionary)
+
+        extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
+        if seq_gen_cls is None:
+            if getattr(args, "print_alignment", False):
+                seq_gen_cls = SequenceGeneratorWithAlignment
+                extra_gen_cls_kwargs["print_alignment"] = args.print_alignment
+            else:
+                seq_gen_cls = SequenceGenerator
+
+        return seq_gen_cls(
+            models,
+            self.target_dictionary,
+            beam_size=getattr(args, "beam", 5),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            search_strategy=search_strategy,
+            **extra_gen_cls_kwargs,
+        )
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        """
+        Do forward and backward, and return the loss as computed by *criterion*
+        for the given *model* and *sample*.
+
+        Args:
+            sample (dict): the mini-batch. The format is defined by the
+                :class:`~fairseq.data.FairseqDataset`.
+            model (~fairseq.models.BaseFairseqModel): the model
+            criterion (~fairseq.criterions.FairseqCriterion): the criterion
+            optimizer (~fairseq.optim.FairseqOptimizer): the optimizer (AMP => autocast)
+            update_num (int): the current update
+            ignore_grad (bool): multiply loss by 0 if this is set to True
+
+        Returns:
+            tuple:
+                - the loss
+                - the sample size, which is used as the denominator for the
+                  gradient
+                - logging outputs to display while training
+        """
+        model.train()
+        model.set_num_updates(update_num)
+        with torch.autograd.profiler.record_function("forward"):
+            with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
+                loss, sample_size, logging_output = criterion(model, sample)
+        if ignore_grad:
+            loss *= 0
+        with torch.autograd.profiler.record_function("backward"):
+            optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        model.eval()
+        with torch.no_grad():
+            loss, sample_size, logging_output = criterion(model, sample)
+        return loss, sample_size, logging_output
+
+    def optimizer_step(self, optimizer, model, update_num):
+        optimizer.step()
+
+    def build_dataset_for_inference(
+        self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
+    ) -> torch.utils.data.Dataset:
+        raise NotImplementedError
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        with torch.no_grad():
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens, constraints=constraints
+            )
+
+    def begin_epoch(self, epoch, model):
+        """Hook function called before the start of each epoch."""
+        pass
+
+    def begin_valid_epoch(self, epoch, model):
+        """Hook function called before the start of each validation epoch."""
+        pass
+
+    def aggregate_logging_outputs(self, logging_outputs, criterion):
+        """[deprecated] Aggregate logging outputs from data parallel training."""
+        utils.deprecation_warning(
+            "The aggregate_logging_outputs API is deprecated. "
+            "Please use the reduce_metrics API instead."
+        )
+        with metrics.aggregate() as agg:
+            self.reduce_metrics(logging_outputs, criterion)
+            return agg.get_smoothed_values()
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Aggregate logging outputs from data parallel training."""
+        # backward compatibility for tasks that override aggregate_logging_outputs
+        base_func = FairseqTask.aggregate_logging_outputs
+        self_func = getattr(self, "aggregate_logging_outputs").__func__
+        if self_func is not base_func:
+            utils.deprecation_warning(
+                "Tasks should implement the reduce_metrics API. "
+                "Falling back to deprecated aggregate_logging_outputs API."
+            )
+            agg_logging_outputs = self.aggregate_logging_outputs(
+                logging_outputs, criterion
+            )
+            for k, v in agg_logging_outputs.items():
+                metrics.log_scalar(k, v)
+            return
+
+        if not any("ntokens" in log for log in logging_outputs):
+            warnings.warn(
+                "ntokens not found in Criterion logging outputs, cannot log wpb or wps"
+            )
+        else:
+            ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+            metrics.log_scalar("wpb", ntokens, priority=180, round=1)
+            metrics.log_speed("wps", ntokens, priority=90, round=1)
+
+        if not any("nsentences" in log for log in logging_outputs):
+            warnings.warn(
+                "nsentences not found in Criterion logging outputs, cannot log bsz"
+            )
+        else:
+            nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+            metrics.log_scalar("bsz", nsentences, priority=190, round=1)
+
+        criterion.__class__.reduce_metrics(logging_outputs)
+
+    def state_dict(self):
+        if self.state is not None:
+            return self.state.state_dict
+        return {}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]):
+        if self.state is not None:
+            self.state.merge_state_dict(state_dict)
+
+    def max_positions(self):
+        """Return the max input length allowed by the task."""
+        return None
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
+        for this task)."""
+        return None
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary` (if applicable
+        for this task)."""
+        return None
+
+    def build_tokenizer(self, args):
+        """Build the pre-tokenizer for this task."""
+        return encoders.build_tokenizer(args)
+
+    def build_bpe(self, args):
+        """Build the BPE tokenizer for this task via ``encoders.build_bpe``."""
+        return encoders.build_bpe(args)
+
+    def get_interactive_tokens_and_lengths(self, lines, encode_fn):
+        tokens = [
+            self.source_dictionary.encode_line(
+                encode_fn(src_str), add_if_not_exist=False
+            ).long()
+            for src_str in lines
+        ]
+        lengths = [t.numel() for t in tokens]
+        return tokens, lengths
+
+
+class LegacyFairseqTask(FairseqTask):
+    def __init__(self, args: Namespace):
+        super().__init__(None)
+        self.args = args
+        self.datasets = {}
+        self.dataset_to_epoch_iter = {}
+
+    @classmethod
+    def setup_task(cls, args: Namespace, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        return cls(args, **kwargs)
+
+    def has_sharded_data(self, split):  # True iff args.data contains os.pathsep
+        return os.pathsep in (getattr(self.args, "data", None) or "")  # None-safe
+
+    def build_model(self, args: Namespace, from_checkpoint=False):
+        """
+        Build the :class:`~fairseq.models.BaseFairseqModel` instance for this
+        task.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+
+        Returns:
+            a :class:`~fairseq.models.BaseFairseqModel` instance
+        """
+        from fairseq import models, quantization_utils
+
+        model = models.build_model(args, self, from_checkpoint)
+        model = quantization_utils.quantize_model_scalar(model, args)
+        return model
+
+    def build_criterion(self, args: Namespace):
+        """
+        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
+        this task.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+
+        Returns:
+            a :class:`~fairseq.criterions.FairseqCriterion` instance
+        """
+        from fairseq import criterions
+
+        return criterions.build_criterion(args, self)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
new file mode 100644
index 0000000000..667f5f8ee4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
@@ -0,0 +1,55 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+from fairseq.data.audio.frm_text_to_speech_dataset import FrmTextToSpeechDatasetCreator
+from fairseq.tasks import register_task
+from fairseq.tasks.text_to_speech import TextToSpeechTask
+
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+@register_task("frm_text_to_speech")
+class FrmTextToSpeechTask(TextToSpeechTask):
+    @staticmethod
+    def add_args(parser):
+        TextToSpeechTask.add_args(parser)
+        parser.add_argument("--do_chunk", action="store_true", help="train on chunks")
+        parser.add_argument("--chunk_bound", default=-1, type=int)
+        parser.add_argument("--chunk_init", default=50, type=int)
+        parser.add_argument("--chunk_incr", default=5, type=int)
+        parser.add_argument("--add_eos", action="store_true")
+        parser.add_argument("--dedup", action="store_true")
+        parser.add_argument("--ref_fpu", default=-1, type=float)
+
+    def load_dataset(self, split, **unused_kwargs):
+        is_train_split = split.startswith("train")
+        pre_tokenizer = self.build_tokenizer(self.args)
+        bpe_tokenizer = self.build_bpe(self.args)
+        self.datasets[split] = FrmTextToSpeechDatasetCreator.from_tsv(
+            self.args.data,
+            self.data_cfg,
+            split,
+            self.src_dict,
+            pre_tokenizer,
+            bpe_tokenizer,
+            is_train_split=is_train_split,
+            n_frames_per_step=self.args.n_frames_per_step,
+            speaker_to_id=self.speaker_to_id,
+            do_chunk=self.args.do_chunk,
+            chunk_bound=self.args.chunk_bound,
+            chunk_init=self.args.chunk_init,
+            chunk_incr=self.args.chunk_incr,
+            add_eos=self.args.add_eos,
+            dedup=self.args.dedup,
+            ref_fpu=self.args.ref_fpu,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
new file mode 100644
index 0000000000..1a3605f14d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import os
+import sys
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+
+from dataclasses import dataclass, field
+from fairseq.data import Dictionary, HubertDataset
+from fairseq.dataclass.configs import FairseqDataclass
+from fairseq.tasks import register_task
+from fairseq.tasks.fairseq_task import FairseqTask
+from omegaconf import MISSING
+
+logger = logging.getLogger(__name__)
+
+
+class LabelEncoder(object):
+    def __init__(self, dictionary: Dictionary) -> None:
+        self.dictionary = dictionary
+
+    def __call__(self, label: str) -> List[str]:  # NOTE(review): annotation unverified
+        return self.dictionary.encode_line(  # presumably yields a tensor — confirm
+            label,
+            append_eos=False,
+            add_if_not_exist=False,
+        )
+
+
+@dataclass
+class HubertPretrainingConfig(FairseqDataclass):
+    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
+    fine_tuning: bool = field(
+        default=False, metadata={"help": "set to true if fine-tuning Hubert"}
+    )
+    labels: List[str] = field(
+        default_factory=lambda: ["ltr"],
+        metadata={
+            "help": (
+                "extension of the label files to load, frame-level labels for"
+                " pre-training, and sequence-level label for fine-tuning"
+            )
+        },
+    )
+    label_dir: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "if set, looks for labels in this directory instead",
+        },
+    )
+    label_rate: float = field(
+        default=-1.0,
+        metadata={"help": "label frame rate. -1.0 for sequence label"},
+    )
+    sample_rate: int = field(
+        default=16_000,
+        metadata={
+            "help": "target sample rate. audio files will be up/down "
+            "sampled to this rate"
+        },
+    )
+    normalize: bool = field(
+        default=False,
+        metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
+    )
+    enable_padding: bool = field(
+        default=False,
+        metadata={"help": "pad shorter samples instead of cropping"},
+    )
+    max_keep_size: Optional[int] = field(
+        default=None,
+        metadata={"help": "exclude sample longer than this"},
+    )
+    max_sample_size: Optional[int] = field(
+        default=None,
+        metadata={"help": "max sample size to crop to for batching"},
+    )
+    min_sample_size: Optional[int] = field(
+        default=None,
+        metadata={"help": "min sample size to crop to for batching"},
+    )
+    single_target: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "if set, AddTargetDatasets outputs same keys " "as AddTargetDataset"
+        },
+    )
+    random_crop: Optional[bool] = field(
+        default=True,
+        metadata={"help": "always crop from the beginning if false"},
+    )
+    pad_audio: Optional[bool] = field(
+        default=False,
+        metadata={"help": "pad audio to the longest one in the batch if true"},
+    )
+
+
+@register_task("hubert_pretraining", dataclass=HubertPretrainingConfig)
+class HubertPretrainingTask(FairseqTask):
+
+    cfg: HubertPretrainingConfig
+
+    def __init__(
+        self,
+        cfg: HubertPretrainingConfig,
+    ) -> None:
+        super().__init__(cfg)
+
+        logger.info(f"current directory is {os.getcwd()}")
+        logger.info(f"HubertPretrainingTask Config {cfg}")
+
+        self.cfg = cfg
+        self.fine_tuning = cfg.fine_tuning
+
+        if cfg.fine_tuning:
+            self.state.add_factory("target_dictionary", self.load_dictionaries)
+        else:
+            self.state.add_factory("dictionaries", self.load_dictionaries)
+
+        self.blank_symbol = "<s>"
+
+    @property
+    def source_dictionary(self) -> Optional[Dictionary]:
+        return None
+
+    @property
+    def target_dictionary(self) -> Optional[Dictionary]:
+        return self.state.target_dictionary
+
+    @property
+    def dictionaries(self) -> List[Dictionary]:
+        return self.state.dictionaries
+
+    @classmethod
+    def setup_task(
+        cls, cfg: HubertPretrainingConfig, **kwargs
+    ) -> "HubertPretrainingTask":
+        return cls(cfg)
+
+    def load_dictionaries(self):  # single Dictionary if fine-tuning, else a list
+        label_dir = self.get_label_dir()  # cfg.label_dir, falling back to cfg.data
+        dictionaries = [  # one dictionary per configured label extension
+            Dictionary.load(f"{label_dir}/dict.{label}.txt")
+            for label in self.cfg.labels
+        ]
+        return dictionaries[0] if self.cfg.fine_tuning else dictionaries
+
+    def get_label_dir(self) -> str:
+        if self.cfg.label_dir is None:
+            return self.cfg.data
+        return self.cfg.label_dir
+
+    def load_dataset(self, split: str, **kwargs) -> None:  # fills self.datasets[split]
+        manifest = f"{self.cfg.data}/{split}.tsv"
+        dicts = [self.target_dictionary] if self.cfg.fine_tuning else self.dictionaries
+        pad_list = [d.pad() for d in dicts]  # per-dictionary pad index
+        eos_list = [d.eos() for d in dicts]  # per-dictionary eos index
+        procs = [LabelEncoder(d) for d in dicts]  # encode label lines via each dict
+        paths = [f"{self.get_label_dir()}/{split}.{ext}" for ext in self.cfg.labels]
+
+        # hubert v1: pad_audio=True, random_crop=False;
+        self.datasets[split] = HubertDataset(
+            manifest,
+            sample_rate=self.cfg.sample_rate,
+            label_paths=paths,
+            label_rates=self.cfg.label_rate,
+            pad_list=pad_list,
+            eos_list=eos_list,
+            label_processors=procs,
+            max_keep_sample_size=self.cfg.max_keep_size,
+            min_keep_sample_size=self.cfg.min_sample_size,
+            max_sample_size=self.cfg.max_sample_size,
+            pad_audio=self.cfg.pad_audio,
+            normalize=self.cfg.normalize,
+            store_labels=False,
+            random_crop=self.cfg.random_crop,
+            single_target=self.cfg.single_target,
+        )
+
+    def max_positions(self) -> Tuple[int, int]:
+        return (sys.maxsize, sys.maxsize)
+
+    def filter_indices_by_size(self, indices: np.ndarray, *args, **kwargs) -> np.ndarray:
+        return indices
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
new file mode 100644
index 0000000000..44d5324b3d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
@@ -0,0 +1,383 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import torch
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    Dictionary,
+    IdDataset,
+    LMContextWindowDataset,
+    MonolingualDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    TruncatedDictionary,
+    data_utils,
+)
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import LegacyFairseqTask, register_task
+from omegaconf import II
+
+
+SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LanguageModelingConfig(FairseqDataclass):
+    data: Optional[str] = field(
+        default=None, metadata={"help": "path to data directory"}
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="none",
+        metadata={
+            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    output_dictionary_size: int = field(
+        default=-1, metadata={"help": "limit the size of output dictionary"}
+    )
+    self_target: bool = field(default=False, metadata={"help": "include self target"})
+    future_target: bool = field(
+        default=False, metadata={"help": "include future target"}
+    )
+    past_target: bool = field(default=False, metadata={"help": "include past target"})
+    add_bos_token: bool = field(
+        default=False, metadata={"help": "prepend beginning of sentence token (<s>)"}
+    )
+    max_target_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    pad_to_fixed_length: Optional[bool] = field(
+        default=False,
+        metadata={"help": "pad to fixed length"},
+    )
+    pad_to_fixed_bsz: Optional[bool] = field(
+        default=False,
+        metadata={"help": "boolean to pad to fixed batch size"},
+    )
+
+    # TODO common vars below add to parent
+    seed: int = II("common.seed")
+    batch_size: Optional[int] = II("dataset.batch_size")
+    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    data_buffer_size: int = II("dataset.data_buffer_size")
+    tpu: bool = II("common.tpu")
+    use_plasma_view: bool = II("common.use_plasma_view")
+    plasma_path: str = II("common.plasma_path")
+
+
+@register_task("language_modeling", dataclass=LanguageModelingConfig)
+class LanguageModelingTask(LegacyFairseqTask):
+    """
+    Train a language model.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
+            the language model
+        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
+            output of the language model. In most cases it will be the same as
+            *dictionary*, but could possibly be a more limited version of the
+            dictionary (if ``--output-dictionary-size`` is used).
+        targets (List[str]): list of the target types that the language model
+            should predict.  Can be one of "self", "future", and "past".
+            Defaults to "future".
+
+    .. note::
+
+        The language modeling task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
+        :mod:`fairseq-eval-lm`.
+
+    The language modeling task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.language_modeling_parser
+        :prog:
+    """
+
+    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.output_dictionary = output_dictionary or dictionary
+
+        if targets is None:
+            targets = ["future"]
+        self.targets = targets
+
+    @classmethod
+    def setup_dictionary(cls, args, **kwargs):
+        dictionary = None
+        output_dictionary = None
+        if args.data:
+            paths = utils.split_paths(args.data)
+            assert len(paths) > 0
+            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+            logger.info("dictionary: {} types".format(len(dictionary)))
+            output_dictionary = dictionary
+            if args.output_dictionary_size >= 0:
+                output_dictionary = TruncatedDictionary(
+                    dictionary, args.output_dictionary_size
+                )
+        return (dictionary, output_dictionary)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)
+
+        # upgrade old checkpoints
+        if getattr(args, "exclude_self_target", False):
+            args.self_target = False
+
+        targets = []
+        if getattr(args, "self_target", False):
+            targets.append("self")
+        if getattr(args, "future_target", False):
+            targets.append("future")
+        if getattr(args, "past_target", False):
+            targets.append("past")
+        if len(targets) == 0:
+            # standard language modeling
+            targets = ["future"]
+
+        return cls(args, dictionary, output_dictionary, targets=targets)
+
+    def build_model(self, args, from_checkpoint=False):
+        model = super().build_model(args, from_checkpoint)
+        for target in self.targets:
+            if target not in model.supported_targets:
+                raise ValueError(
+                    "Unsupported language modeling target: {}".format(target)
+                )
+
+        return model
+
+    def load_dataset(
+        self, split: str, epoch=1, combine=False, **kwargs
+    ) -> None:
+        """Load a given dataset split into ``self.datasets[split]``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, valid1, test)
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+
+        data_path = paths[(epoch - 1) % len(paths)]  # rotate through paths by epoch
+        split_path = os.path.join(data_path, split)
+
+        # each process has its own copy of the raw data (likely to be an np.memmap)
+        dataset = data_utils.load_indexed_dataset(
+            split_path, self.dictionary, self.args.dataset_impl, combine=combine
+        )
+        if dataset is None:
+            raise FileNotFoundError(f"Dataset not found: {split} ({split_path})")
+
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.args.shorten_data_split_list,
+            self.args.shorten_method,
+            self.args.tokens_per_sample,
+            self.args.seed,
+        )
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.args.tokens_per_sample,
+            pad=self.dictionary.pad(),
+            eos=self.dictionary.eos(),
+            break_mode=self.args.sample_break_mode,
+            include_targets=True,
+            use_plasma_view=self.args.use_plasma_view,
+            split_path=split_path,
+            plasma_path=self.args.plasma_path,
+        )
+
+        add_eos_for_other_targets = (
+            self.args.sample_break_mode is not None
+            and self.args.sample_break_mode != "none"
+        )
+        fixed_pad_length = None
+        if self.args.pad_to_fixed_length:
+            fixed_pad_length = self.args.tokens_per_sample
+
+        pad_to_bsz = None
+        if self.args.pad_to_fixed_bsz:
+            pad_to_bsz = (
+                self.args.batch_size_valid if "valid" in split else self.args.batch_size
+            )
+
+        self.datasets[split] = MonolingualDataset(
+            dataset=dataset,
+            sizes=dataset.sizes,
+            src_vocab=self.dictionary,
+            tgt_vocab=self.output_dictionary,
+            add_eos_for_other_targets=add_eos_for_other_targets,
+            shuffle=True,
+            targets=self.targets,
+            add_bos_token=self.args.add_bos_token,
+            fixed_pad_length=fixed_pad_length,
+            pad_to_bsz=pad_to_bsz,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
+        """
+        Generate batches for inference. We prepend an eos token to src_tokens
+        (or bos if `--add-bos-token` is set) and we append a <pad> to target.
+        This is convenient both for generation with a prefix and LM scoring.
+        """
+        dataset = StripTokenDataset(
+            TokenBlockDataset(
+                src_tokens,
+                src_lengths,
+                block_size=None,  # ignored for "eos" break mode
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode="eos",
+            ),
+            # remove eos from (end of) target sequence
+            self.source_dictionary.eos(),
+        )
+        src_dataset = PrependTokenDataset(
+            dataset,
+            token=(
+                self.source_dictionary.bos()
+                if getattr(self.args, "add_bos_token", False)
+                else self.source_dictionary.eos()
+            ),
+        )
+        tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad())
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": PadDataset(
+                        src_dataset,
+                        pad_idx=self.source_dictionary.pad(),
+                        left_pad=False,
+                    ),
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                },
+                "target": PadDataset(
+                    tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False
+                ),
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        with torch.no_grad():
+            # Generation will always be conditioned on bos_token
+            if getattr(self.args, "add_bos_token", False):
+                bos_token = self.source_dictionary.bos()
+            else:
+                bos_token = self.source_dictionary.eos()
+
+            if constraints is not None:
+                raise NotImplementedError(
+                    "Constrained decoding with the language_modeling task is not supported"
+                )
+
+            # SequenceGenerator doesn't use src_tokens directly, we need to
+            # pass the `prefix_tokens` argument instead
+            if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
+                prefix_tokens = sample["net_input"]["src_tokens"]
+                if prefix_tokens[:, 0].eq(bos_token).all():
+                    prefix_tokens = prefix_tokens[:, 1:]
+
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
+            )
+
+    def eval_lm_dataloader(
+        self,
+        dataset,
+        max_tokens: Optional[int] = 36000,
+        batch_size: Optional[int] = None,
+        max_positions: Optional[int] = None,
+        num_shards: int = 1,
+        shard_id: int = 0,
+        num_workers: int = 1,
+        data_buffer_size: int = 10,
+        # ensures that every evaluated token has access to a context of at least
+        # this size, if possible
+        context_window: int = 0,
+    ):
+        if context_window > 0:
+            dataset = LMContextWindowDataset(
+                dataset=dataset,
+                tokens_per_sample=self.args.tokens_per_sample,
+                context_window=context_window,
+                pad_idx=self.source_dictionary.pad(),
+            )
+        return self.get_batch_iterator(
+            dataset=dataset,
+            max_tokens=max_tokens,
+            max_sentences=batch_size,
+            max_positions=max_positions,
+            ignore_invalid_inputs=True,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            data_buffer_size=data_buffer_size,
+        ).next_epoch_itr(shuffle=False)
+
+    @property
+    def source_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` for the language
+        model."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` for the language
+        model."""
+        return self.output_dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
new file mode 100644
index 0000000000..9754976549
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import logging
+import os
+
+import numpy as np
+from fairseq import tokenizer, utils
+from fairseq.data import ConcatDataset, Dictionary, data_utils, indexed_dataset
+from fairseq.data.legacy.block_pair_dataset import BlockPairDataset
+from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset
+from fairseq.data.legacy.masked_lm_dictionary import BertDictionary
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("legacy_masked_lm")
+class LegacyMaskedLMTask(LegacyFairseqTask):
+    """
+    Task for training Masked LM (BERT) model.
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument(
+            "data",
+            help="colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner",
+        )
+        parser.add_argument(
+            "--tokens-per-sample",
+            default=512,
+            type=int,
+            help="max number of total tokens over all segments"
+            " per sample for BERT dataset",
+        )
+        parser.add_argument(
+            "--break-mode", default="doc", type=str, help="mode for breaking sentence"
+        )
+        parser.add_argument("--shuffle-dataset", action="store_true", default=False)
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        return BertDictionary.load(filename)
+
+    @classmethod
+    def build_dictionary(
+        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
+    ):
+        d = BertDictionary()
+        for filename in filenames:
+            Dictionary.add_file_to_dictionary(
+                filename, d, tokenizer.tokenize_line, workers
+            )
+        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
+        return d
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task."""
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        dictionary = BertDictionary.load(os.path.join(paths[0], "dict.txt"))
+        logger.info("dictionary: {} types".format(len(dictionary)))
+
+        return cls(args, dictionary)
+
+    def load_dataset(self, split, epoch=1, combine=False):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): 1-based epoch; selects the data dir in round-robin order
+            combine (bool): also load the shards ``split1``, ``split2``, ...
+        """
+        loaded_datasets = []
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        logger.info("data_path: %s", data_path)
+
+        for k in itertools.count():
+            split_k = split + (str(k) if k > 0 else "")
+            path = os.path.join(data_path, split_k)
+            ds = indexed_dataset.make_dataset(
+                path,
+                impl=self.args.dataset_impl,
+                fix_lua_indexing=True,
+                dictionary=self.dictionary,
+            )
+
+            if ds is None:
+                if k > 0:
+                    break  # no further numbered shard; the base split was loaded
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, data_path)
+                )
+
+            with data_utils.numpy_seed(self.seed + k):
+                loaded_datasets.append(
+                    BlockPairDataset(
+                        ds,
+                        self.dictionary,
+                        ds.sizes,
+                        self.args.tokens_per_sample,
+                        break_mode=self.args.break_mode,
+                        doc_break_size=1,
+                    )
+                )
+
+            logger.info(
+                "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1]))
+            )
+
+            if not combine:
+                break
+
+        if len(loaded_datasets) == 1:
+            dataset = loaded_datasets[0]
+            sizes = dataset.sizes
+        else:
+            dataset = ConcatDataset(loaded_datasets)
+            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])
+
+        self.datasets[split] = MaskedLMDataset(
+            dataset=dataset,
+            sizes=sizes,
+            vocab=self.dictionary,
+            pad_idx=self.dictionary.pad(),
+            mask_idx=self.dictionary.mask(),
+            classif_token_idx=self.dictionary.cls(),
+            sep_token_idx=self.dictionary.sep(),
+            shuffle=self.args.shuffle_dataset,
+            seed=self.seed,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
new file mode 100644
index 0000000000..b064907a50
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
@@ -0,0 +1,327 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+
+import numpy as np
+from omegaconf import II, MISSING, OmegaConf
+
+from fairseq import utils
+from fairseq.data import (
+    Dictionary,
+    IdDataset,
+    MaskTokensDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PrependTokenDataset,
+    RightPadDataset,
+    RightPaddingMaskDataset,
+    SortDataset,
+    TokenBlockDataset,
+    data_utils,
+)
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.dataclass import FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+
+from .language_modeling import SAMPLE_BREAK_MODE_CHOICES, SHORTEN_METHOD_CHOICES
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class MaskedLMConfig(FairseqDataclass):
+    data: str = field(
+        default=MISSING,
+        metadata={
+            "help": "colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner"
+        },
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="none",
+        metadata={
+            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    mask_prob: float = field(
+        default=0.15,
+        metadata={"help": "probability of replacing a token with mask"},
+    )
+    leave_unmasked_prob: float = field(
+        default=0.1,
+        metadata={"help": "probability that a masked token is unmasked"},
+    )
+    random_token_prob: float = field(
+        default=0.1,
+        metadata={"help": "probability of replacing a token with a random token"},
+    )
+    freq_weighted_replacement: bool = field(
+        default=False,
+        metadata={"help": "sample random replacement words based on word frequencies"},
+    )
+    mask_whole_words: bool = field(
+        default=False,
+        metadata={"help": "mask whole words; you may also want to set --bpe"},
+    )
+    mask_multiple_length: int = field(
+        default=1,
+        metadata={"help": "repeat the mask indices multiple times"},
+    )
+    mask_stdev: float = field(
+        default=0.0,
+        metadata={"help": "stdev of the mask length"},
+    )
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    seed: int = II("common.seed")
+
+    include_target_tokens: bool = field(
+        default=False,
+        metadata={
+            "help": "include target tokens in model input. this is used for data2vec"
+        },
+    )
+    include_index: bool = field(
+        default=True,
+        metadata={"help": "include index in model input. this is used for data2vec"},
+    )
+    skip_masking: bool = field(
+        default=False,
+        metadata={"help": "skip masking at dataset"},
+    )
+    # subsample_train: float = field(
+    #     default=1,
+    #     metadata={"help": "shorten training set for debugging"},
+    # )
+    d2v2_multi: bool = field(
+        default=False,
+        metadata={"help": "prepare dataset for data2vec_multi"},
+    )
+
+
+@register_task("masked_lm", dataclass=MaskedLMConfig)
+class MaskedLMTask(FairseqTask):
+    """Task for training masked language models (e.g., BERT, RoBERTa)."""
+
+    # annotation only: declares the type of ``self.cfg``, assigns nothing
+    cfg: MaskedLMConfig
+
+    def __init__(self, cfg: MaskedLMConfig, dictionary=None):
+        super().__init__(cfg)
+        self.dictionary = dictionary or self.load_dict(cfg)
+
+        # add mask token
+        self.mask_idx = self.dictionary.add_symbol("<mask>")
+
+    @classmethod
+    def setup_task(cls, cfg: MaskedLMConfig, **kwargs):
+        dictionary = cls.load_dict(cfg)
+        return cls(cfg, dictionary)
+
+    @classmethod
+    def load_dict(cls, cfg):
+        paths = utils.split_paths(cfg.data)
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        return dictionary
+
+    def _load_dataset_split(self, split, epoch, combine):
+        paths = utils.split_paths(self.cfg.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.source_dictionary,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError(
+                "Dataset not found: {} ({})".format(split, split_path)
+            )
+
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.cfg.shorten_data_split_list,
+            self.cfg.shorten_method,
+            self.cfg.tokens_per_sample,
+            self.cfg.seed,
+        )
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.cfg.tokens_per_sample - 1,  # one less for <s>
+            pad=self.source_dictionary.pad(),
+            eos=self.source_dictionary.eos(),
+            break_mode=self.cfg.sample_break_mode,
+        )
+        logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))
+
+        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+        return PrependTokenDataset(dataset, self.source_dictionary.bos())
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        dataset = self._load_dataset_split(split, epoch, combine)
+
+        # NOTE(review): every other access here uses self.cfg; confirm self.args exists
+        mask_whole_words = (
+            get_whole_word_mask(self.args, self.source_dictionary)
+            if self.cfg.mask_whole_words
+            else None
+        )
+        # create masked input and targets
+        src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
+            dataset,
+            self.source_dictionary,
+            pad_idx=self.source_dictionary.pad(),
+            mask_idx=self.mask_idx,
+            seed=self.cfg.seed,
+            mask_prob=self.cfg.mask_prob,
+            leave_unmasked_prob=self.cfg.leave_unmasked_prob,
+            random_token_prob=self.cfg.random_token_prob,
+            freq_weighted_replacement=self.cfg.freq_weighted_replacement,
+            mask_whole_words=mask_whole_words,
+            mask_multiple_length=self.cfg.mask_multiple_length,
+            mask_stdev=self.cfg.mask_stdev,
+            skip_masking=self.cfg.skip_masking,
+        )
+
+        with data_utils.numpy_seed(self.cfg.seed):
+            shuffle = np.random.permutation(len(src_dataset))
+
+        target_dataset = RightPadDataset(
+            tgt_dataset,
+            pad_idx=self.source_dictionary.pad(),
+        )
+
+        if self.cfg.d2v2_multi:
+            dataset = self._d2v2_multi_dataset(src_dataset)
+        else:
+            dataset = self._regular_dataset(src_dataset, target_dataset)
+
+        self.datasets[split] = SortDataset(
+            dataset, sort_order=[shuffle, src_dataset.sizes]
+        )
+
+    def _regular_dataset(self, src_dataset, target_dataset):
+        input_dict = {
+            "src_tokens": RightPadDataset(
+                src_dataset,
+                pad_idx=self.source_dictionary.pad(),
+            ),
+            "src_lengths": NumelDataset(src_dataset, reduce=False),
+        }
+        if self.cfg.include_target_tokens:
+            input_dict["target_tokens"] = target_dataset
+        if self.cfg.include_index:
+            input_dict["src_id"] = IdDataset()
+
+        dataset = NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": input_dict,
+                "target": target_dataset,
+                "nsentences": NumSamplesDataset(),
+                "ntokens": NumelDataset(src_dataset, reduce=True),
+            },
+            sizes=[src_dataset.sizes],
+        )
+        return dataset
+
+    def _d2v2_multi_dataset(self, src_dataset):
+        input_dict = {
+            "source": RightPadDataset(
+                src_dataset,
+                pad_idx=self.source_dictionary.pad(),
+            ),
+            "id": IdDataset(),
+            "padding_mask": RightPaddingMaskDataset(src_dataset),
+        }
+
+        dataset = NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": input_dict,
+                "nsentences": NumSamplesDataset(),
+                "ntokens": NumelDataset(src_dataset, reduce=True),
+            },
+            sizes=[src_dataset.sizes],
+        )
+        return dataset
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
+        src_dataset = RightPadDataset(
+            TokenBlockDataset(
+                src_tokens,
+                src_lengths,
+                self.cfg.tokens_per_sample - 1,  # one less for <s>
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode="eos",
+            ),
+            pad_idx=self.source_dictionary.pad(),
+        )
+        src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos())
+        src_dataset = NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": src_dataset,
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                },
+            },
+            sizes=src_lengths,
+        )
+        if sort:
+            src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
+        return src_dataset
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+    def begin_epoch(self, epoch, model):
+        model.set_epoch(epoch)
+
+    def max_positions(self):
+        return self.cfg.tokens_per_sample
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
new file mode 100644
index 0000000000..cb5ee34554
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+from omegaconf import II
+
+from fairseq.data import (
+    AppendTokenDataset,
+    ConcatDataset,
+    DenoisingDataset,
+    Dictionary,
+    PrependTokenDataset,
+    ResamplingDataset,
+    SortDataset,
+    TokenBlockDataset,
+    data_utils,
+)
+from fairseq.data.encoders.utils import get_whole_word_mask
+from fairseq.tasks import register_task
+
+from .denoising import DenoisingConfig, DenoisingTask
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class MultilingualDenoisingConfig(DenoisingConfig):
+    multilang_sampling_alpha: float = field(
+        default=1.0,
+        metadata={"help": "smoothing alpha for sample ratios across multiple datasets"},
+    )
+    add_lang_token: bool = field(
+        default=False,
+        metadata={"help": "end each sample with a per-language [lang] token, not EOS"},
+    )
+    langs: Optional[str] = field(
+        default=None,
+        metadata={"help": "language ids we are considering"},
+    )
+    no_whole_word_mask_langs: str = field(
+        default="",
+        metadata={
+            "help": "languages without spacing between words don't support whole word masking"
+        },
+    )
+    train_subset: str = II("common.train_subset")
+    valid_subset: str = II("common.valid_subset")
+
+
+@register_task("multilingual_denoising", dataclass=MultilingualDenoisingConfig)
+class MultilingualDenoisingTask(DenoisingTask):
+
+    cfg: MultilingualDenoisingConfig
+
+    @classmethod
+    def setup_task(cls, cfg: MultilingualDenoisingConfig, **kwargs):
+        """Setup the task."""
+        paths = cfg.data.split(":")
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+
+        data_path = paths[0]
+        if cfg.langs is None:
+            languages = sorted(
+                [
+                    name
+                    for name in os.listdir(data_path)
+                    if os.path.isdir(os.path.join(data_path, name))
+                ]
+            )
+        else:
+            languages = cfg.langs.split(",")
+
+        if cfg.add_lang_token:
+            for lang in languages:
+                dictionary.add_symbol("[{}]".format(lang))
+
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        if not hasattr(cfg, "shuffle_instance"):
+            cfg.shuffle_instance = False
+        return cls(cfg, dictionary)
+
+    def __init__(self, cfg: MultilingualDenoisingConfig, dictionary):
+        super().__init__(cfg, dictionary)
+        self.dictionary = dictionary
+
+        # add mask token
+        self.mask_idx = self.dictionary.add_symbol("<mask>")
+        self.cfg = cfg
+
+    def _get_sample_prob(self, dataset_lens):
+        """Return per-language sampling probabilities after exponent smoothing.
+
+        Each language's share of the data is raised to
+        ``multilang_sampling_alpha`` and renormalised to sum to one.
+        """
+        alpha = self.cfg.multilang_sampling_alpha
+        shares = (dataset_lens / dataset_lens.sum()) ** alpha
+        return shares / shares.sum()
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): 1-based epoch; selects the data dir in round-robin order
+            combine (bool): forwarded to ``data_utils.load_indexed_dataset``
+        """
+        paths = self.cfg.data.split(":")
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        if self.cfg.langs is None:
+            languages = sorted(
+                [
+                    name
+                    for name in os.listdir(data_path)
+                    if os.path.isdir(os.path.join(data_path, name))
+                ]
+            )
+        else:
+            languages = self.cfg.langs.split(",")
+            for name in languages:
+                p = os.path.join(data_path, name)
+                assert os.path.exists(p), "data not found: {}".format(p)
+
+        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
+        logger.info(
+            "Language to id mapping: %s", {lang: i for i, lang in enumerate(languages)}
+        )
+
+        mask_whole_words = get_whole_word_mask(self.cfg.bpe, self.dictionary)
+        language_without_segmentations = self.cfg.no_whole_word_mask_langs.split(",")
+        lang_datasets = []
+        for language in languages:
+            split_path = os.path.join(data_path, language, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.cfg.dataset_impl,
+                combine=combine,
+            )
+            if dataset is None:
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, split_path)
+                )
+
+            end_token = (
+                self.source_dictionary.index("[{}]".format(language))
+                if self.cfg.add_lang_token
+                else self.source_dictionary.eos()
+            )
+
+            # create continuous blocks of tokens
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.cfg.tokens_per_sample - 2,  # two less: <s> and the end token
+                pad=self.source_dictionary.pad(),
+                eos=end_token,
+                break_mode=self.cfg.sample_break_mode,
+            )
+            logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))
+
+            # wrap each block as <s> ... end_token (EOS or the [lang] token)
+            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+            dataset = AppendTokenDataset(dataset, end_token)
+
+            lang_mask_whole_words = (
+                mask_whole_words
+                if language not in language_without_segmentations
+                else None
+            )
+            lang_dataset = DenoisingDataset(
+                dataset,
+                dataset.sizes,
+                self.dictionary,
+                self.mask_idx,
+                lang_mask_whole_words,
+                shuffle=self.cfg.shuffle_instance,
+                seed=self.cfg.seed,
+                mask=self.cfg.mask,
+                mask_random=self.cfg.mask_random,
+                insert=self.cfg.insert,
+                rotate=self.cfg.rotate,
+                permute_sentences=self.cfg.permute_sentences,
+                bpe=self.cfg.bpe,
+                replace_length=self.cfg.replace_length,
+                mask_length=self.cfg.mask_length,
+                poisson_lambda=self.cfg.poisson_lambda,
+                eos=None
+                if not self.cfg.add_lang_token
+                else self.source_dictionary.index("[{}]".format(language)),
+            )
+            lang_datasets.append(lang_dataset)
+
+        dataset_lengths = np.array([len(d) for d in lang_datasets], dtype=float)
+        logger.info(
+            "loaded total {} blocks for all languages".format(
+                int(dataset_lengths.sum()),
+            )
+        )
+        if split == self.cfg.train_subset:
+            # For train subset, additionally up or down sample languages.
+            sample_probs = self._get_sample_prob(dataset_lengths)
+            logger.info(
+                "Sample probability by language: {}".format(
+                    {
+                        lang: "{0:.4f}".format(sample_probs[i])
+                        for i, lang in enumerate(languages)
+                    }
+                )
+            )
+            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
+            logger.info(
+                "Up/Down Sampling ratio by language: {}".format(
+                    {
+                        lang: "{0:.2f}".format(size_ratio[i])
+                        for i, lang in enumerate(languages)
+                    }
+                )
+            )
+
+            resampled_lang_datasets = [
+                ResamplingDataset(
+                    lang_dataset,
+                    size_ratio=ratio,
+                    seed=self.cfg.seed,
+                    epoch=epoch,
+                    # sample with replacement only when the language is upsampled
+                    replace=ratio >= 1.0,
+                )
+                for lang_dataset, ratio in zip(lang_datasets, size_ratio)
+            ]
+            dataset = ConcatDataset(resampled_lang_datasets)
+        else:
+            dataset = ConcatDataset(lang_datasets)
+            # also register each language's data as its own "<split>_<lang>" split
+            lang_splits = [split]
+            for lang_id, lang_dataset in enumerate(lang_datasets):
+                split_name = split + "_" + languages[lang_id]
+                lang_splits.append(split_name)
+                self.datasets[split_name] = lang_dataset
+
+            # expand the split name in valid_subset to include the per-language splits
+            if split in self.cfg.valid_subset:
+                self.cfg.valid_subset = self.cfg.valid_subset.replace(
+                    split, ",".join(lang_splits)
+                )
+
+        # epoch-dependent seed: a different but reproducible permutation per epoch
+        with data_utils.numpy_seed(self.cfg.seed + epoch):
+            shuffle = np.random.permutation(len(dataset))
+
+        self.datasets[split] = SortDataset(
+            dataset,
+            sort_order=[
+                shuffle,
+                dataset.sizes,
+            ],
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
new file mode 100644
index 0000000000..8fd5e5954d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
@@ -0,0 +1,627 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import torch
+from omegaconf import II
+
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    ConcatDataset,
+    Dictionary,
+    IdDataset,
+    LMContextWindowDataset,
+    MonolingualDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    ResamplingDataset,
+    SortDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    TruncatedDictionary,
+    data_utils,
+)
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+logger = logging.getLogger(__name__)
+
+
+def lang_token(lang):
+    return "<{}>".format(lang)  # e.g. "en_XX" -> "<en_XX>"
+
+
+@dataclass
+class MultilingualLanguageModelingConfig(FairseqDataclass):
+    # Task options. TODO: move the variables shared with other tasks to a parent config.
+    data: Optional[str] = field(
+        default=None, metadata={"help": "path to data directory"}
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="none",
+        metadata={
+            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    output_dictionary_size: int = field(
+        default=-1, metadata={"help": "limit the size of output dictionary"}
+    )
+    self_target: bool = field(default=False, metadata={"help": "include self target"})
+    future_target: bool = field(
+        default=False, metadata={"help": "include future target"}
+    )
+    past_target: bool = field(default=False, metadata={"help": "include past target"})
+    add_bos_token: bool = field(
+        default=False, metadata={"help": "prepend lang id token <dialect>"}
+    )
+    max_source_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the source sequence"}
+    )
+    max_target_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    pad_to_fixed_length: Optional[bool] = field(
+        default=False, metadata={"help": "pad to fixed length"}
+    )
+    pad_to_fixed_bsz: Optional[bool] = field(
+        default=False, metadata={"help": "boolean to pad to fixed batch size"}
+    )
+
+    multilang_sampling_alpha: Optional[float] = field(
+        default=1.0,
+        metadata={
+            "help": "smoothing alpha for sample rations across multiple datasets"
+        },
+    )
+
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+
+    langs: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of languages (default: all directories in data path)"
+        },
+    )
+    baseline_model_langs: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of languages in the baseline model (default: none)"
+        },
+    )
+    # TODO: legacy parameter kept for compatibility (the task in this file never reads it)
+    baseline_model: str = field(
+        default="",
+        metadata={"help": "path to the baseline model (default: none)"},
+    )
+
+    lang_to_offline_shard_ratio: str = field(
+        default="",
+        metadata={
+            "help": "absolute path of tsv file location to indicate lang to offline shard ratio.",
+        },
+    )
+    # Fields below are interpolated (II) from other config groups. TODO: move them to a parent.
+    seed: int = II("common.seed")
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    data_buffer_size: int = II("dataset.data_buffer_size")
+    tpu: bool = II("common.tpu")
+    batch_size: Optional[int] = II("dataset.batch_size")
+    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
+    train_subset: str = II("common.train_subset")  # NOTE(review): confirm the "common." group
+    valid_subset: str = II("common.valid_subset")  # NOTE(review): same question as above
+
+
+@register_task(
+    "multilingual_language_modeling", dataclass=MultilingualLanguageModelingConfig
+)
+class MultilingualLanguageModelingTask(LegacyFairseqTask):
+    """
+    Train a multilingual language model.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
+            the language model
+        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
+            output of the language model. In most cases it will be the same as
+            *dictionary*, but could possibly be a more limited version of the
+            dictionary (if ``--output-dictionary-size`` is used).
+        targets (List[str]): list of the target types that the language model
+            should predict.  Can be one of "self", "future", and "past".
+            Defaults to "future".
+
+    .. note::
+
+        The language modeling task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
+        :mod:`fairseq-eval-lm`.
+
+    The language modeling task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.language_modeling_parser
+        :prog:
+    """
+
+    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
+        """Store dictionaries and target types; ``targets`` defaults to ["future"]."""
+        super().__init__(args)
+        # A falsy output dictionary means "reuse the input dictionary".
+        if not output_dictionary:
+            output_dictionary = dictionary
+        self.dictionary, self.output_dictionary = dictionary, output_dictionary
+        self.targets = ["future"] if targets is None else targets
+
+    @staticmethod
+    def _get_langs(args, epoch=1):
+        """Pick the data shard for ``epoch`` and list its language sub-directories."""
+        shards = utils.split_paths(args.data)
+        assert len(shards) > 0
+        data_path = shards[(epoch - 1) % len(shards)]  # shards rotate with the epoch
+
+        languages = []
+        for entry in sorted(os.listdir(data_path)):
+            if os.path.isdir(os.path.join(data_path, entry)):
+                languages.append(entry)
+        if args.langs:
+            # restrict to the requested languages, all of which must be present
+            wanted = set(args.langs.split(","))
+            languages = [lang for lang in languages if lang in wanted]
+            assert len(languages) == len(wanted)
+        return languages, data_path
+
+    @classmethod
+    def setup_dictionary(cls, args, **kwargs):
+        """Load ``dict.txt`` and derive the (possibly truncated) output dictionary.
+
+        Returns ``(None, None)`` when ``args.data`` is empty.
+        """
+        if not args.data:
+            return (None, None)
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        dictionary = Dictionary.load(os.path.join(data_dirs[0], "dict.txt"))
+        if args.add_bos_token:
+            languages, _ = cls._get_langs(args)
+            logger.info("----------------")
+            for token in map(lang_token, languages):
+                dictionary.add_symbol(token)
+                logger.info(f"add language token: {token}")
+            logger.info("----------------")
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        if args.output_dictionary_size < 0:
+            return (dictionary, dictionary)
+        truncated = TruncatedDictionary(dictionary, args.output_dictionary_size)
+        return (dictionary, truncated)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task: load dictionaries and work out the prediction targets.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+
+        Returns:
+            an instance of ``cls`` whose targets follow the ``*_target`` flags
+            (``["future"]`` when none of them is set).
+        """
+        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)
+
+        # Old checkpoints carry ``exclude_self_target``; map it to ``self_target``.
+        if hasattr(args, "exclude_self_target"):
+            args.self_target = not args.exclude_self_target
+
+        targets = [
+            kind
+            for kind in ("self", "future", "past")
+            if getattr(args, kind + "_target", False)
+        ]
+        if not targets:
+            targets = ["future"]  # standard language modeling
+        return cls(args, dictionary, output_dictionary, targets=targets)
+
+    def build_model(self, args, from_checkpoint=False):
+        """Build the model and reject targets it does not list as supported."""
+        model = super().build_model(args, from_checkpoint)
+        for target in self.targets:
+            if target in model.supported_targets:
+                continue
+            msg = f"Unsupported language modeling target: {target} not in {model.supported_targets}"
+            raise ValueError(msg)
+        return model
+
+    def _get_sample_prob(self, dataset_lens):
+        """Return per-language sampling probabilities smoothed by an exponent.
+
+        Each language's share of ``dataset_lens`` is raised to the power
+        ``multilang_sampling_alpha`` and the result is renormalized to sum to 1.
+        """
+        alpha = self.args.multilang_sampling_alpha
+        weights = (dataset_lens / dataset_lens.sum()) ** alpha
+        return weights / weights.sum()
+
+    def load_dataset(self, split: str, epoch=1, combine=False, **kwargs):
+        """Load ``split`` for all languages; the train subset is resampled per language.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        languages, data_path = MultilingualLanguageModelingTask._get_langs(
+            self.args, epoch
+        )
+        lang_to_offline_shard_ratio = None
+        if self.args.lang_to_offline_shard_ratio != "":
+            lang_to_offline_shard_ratio = {}
+            assert os.path.exists(
+                self.args.lang_to_offline_shard_ratio
+            ), "provided offline shard ratio file doesn't exist: {0}".format(
+                self.args.lang_to_offline_shard_ratio
+            )
+            with open(self.args.lang_to_offline_shard_ratio) as fin:
+                for line in fin:
+                    if line.strip():  # tolerate blank (e.g. trailing) lines
+                        lang, ratio = line.strip().split("\t")
+                        lang_to_offline_shard_ratio[lang] = float(ratio)
+
+            logger.info(
+                "Found offline sharded ratio: %s",
+                lang_to_offline_shard_ratio,
+            )
+
+        if split == self.args.train_subset:
+            logger.info(
+                "Training on {0} languages: {1}".format(len(languages), languages)
+            )
+        else:
+            logger.info(
+                "Evaluating on {0} languages: {1}".format(len(languages), languages)
+            )
+
+        tokens_per_sample = self.args.tokens_per_sample - int(self.args.add_bos_token)
+
+        fixed_pad_length = None
+        if self.args.pad_to_fixed_length:
+            fixed_pad_length = self.args.tokens_per_sample
+
+        pad_to_bsz = None
+        if self.args.pad_to_fixed_bsz:
+            pad_to_bsz = (
+                self.args.batch_size_valid if "valid" in split else self.args.batch_size
+            )
+
+        lang_datasets = []
+        for language in languages:
+            split_path = os.path.join(data_path, language, split)
+            dataset = data_utils.load_indexed_dataset(
+                split_path, self.dictionary, self.args.dataset_impl, combine=combine
+            )
+            # a None result is treated as "split missing on disk"
+            if dataset is None:
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, split_path)
+                )
+
+            dataset = maybe_shorten_dataset(
+                dataset,
+                split,
+                self.args.shorten_data_split_list,
+                self.args.shorten_method,
+                tokens_per_sample,
+                self.args.seed,
+            )
+
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                tokens_per_sample,
+                pad=self.dictionary.pad(),
+                eos=self.dictionary.eos(),
+                break_mode=self.args.sample_break_mode,
+                include_targets=True,
+            )
+
+            add_eos_for_other_targets = (
+                self.args.sample_break_mode is not None
+                and self.args.sample_break_mode != "none"
+            )
+            src_lang_idx, tgt_lang_idx = None, None
+            if self.args.add_bos_token:
+                src_lang_idx = self.dictionary.index(lang_token(language))
+                tgt_lang_idx = self.output_dictionary.index(lang_token(language))
+
+            lang_datasets.append(
+                MonolingualDataset(
+                    dataset=dataset,
+                    sizes=dataset.sizes,
+                    src_vocab=self.dictionary,
+                    tgt_vocab=self.output_dictionary,
+                    add_eos_for_other_targets=add_eos_for_other_targets,
+                    shuffle=True,
+                    targets=self.targets,
+                    fixed_pad_length=fixed_pad_length,
+                    pad_to_bsz=pad_to_bsz,
+                    add_bos_token=self.args.add_bos_token,
+                    src_lang_idx=src_lang_idx,
+                    tgt_lang_idx=tgt_lang_idx,
+                )
+            )
+
+        dataset_lengths = np.array(
+            [len(d) for d in lang_datasets],
+            dtype=float,
+        )
+        logger.info(
+            "loaded total {} blocks for all languages".format(
+                dataset_lengths.sum(),
+            )
+        )
+        if split == self.args.train_subset:
+            dataset_lengths_ratio_multiplier = np.ones(len(dataset_lengths))
+            if lang_to_offline_shard_ratio is not None:
+                dataset_lengths_ratio_multiplier = []
+                for lang in languages:
+                    assert (
+                        lang in lang_to_offline_shard_ratio
+                    ), "Lang: {0} missing in offline shard ratio file: {1}".format(
+                        lang,
+                        self.args.lang_to_offline_shard_ratio,
+                    )
+                    dataset_lengths_ratio_multiplier.append(
+                        lang_to_offline_shard_ratio[lang]
+                    )
+                dataset_lengths_ratio_multiplier = np.array(
+                    dataset_lengths_ratio_multiplier
+                )
+                true_dataset_lengths = (
+                    dataset_lengths * dataset_lengths_ratio_multiplier
+                )
+            else:
+                true_dataset_lengths = dataset_lengths
+            # For train subset, additionally up or down sample languages.
+            sample_probs = self._get_sample_prob(true_dataset_lengths)
+
+            logger.info(
+                "Sample probability by language: %s",
+                {
+                    lang: "{0:.4f}".format(sample_probs[idx])
+                    for idx, lang in enumerate(languages)
+                },
+            )
+            size_ratio = (sample_probs * true_dataset_lengths.sum()) / dataset_lengths
+            # TODO: add an option for shrinking all size ratios to below 1
+            # if self.args.multilang_sampling_alpha != 1:
+            #   size_ratio /= size_ratio.max()
+
+            # Fix numeric errors in size ratio computation
+            #   0.999999999999999999 -> 1
+            #   1.000000000000000002 -> 1
+            for i in range(len(size_ratio)):
+                size_ratio[i] = round(size_ratio[i], 8)
+
+            logger.info(
+                "Up/Down Sampling ratio by language: %s",
+                {
+                    lang: "{0:.2f}".format(size_ratio[idx])
+                    for idx, lang in enumerate(languages)
+                },
+            )
+            logger.info(
+                "Actual dataset size by language: %s",
+                {
+                    lang: "{0:.2f}".format(len(lang_datasets[idx]))
+                    for idx, lang in enumerate(languages)
+                },
+            )
+            resampled_lang_datasets = [
+                ResamplingDataset(
+                    lang_datasets[i],
+                    size_ratio=size_ratio[i],
+                    seed=self.args.seed,
+                    epoch=epoch,
+                    replace=size_ratio[i] > 1.0,
+                )
+                for i in range(len(lang_datasets))
+            ]
+            logger.info(
+                "Resampled dataset size by language: %s",
+                {
+                    lang: "{0:.2f}".format(len(resampled_lang_datasets[idx]))
+                    for idx, lang in enumerate(languages)
+                },
+            )
+            dataset = ConcatDataset(resampled_lang_datasets)
+        else:
+            dataset = ConcatDataset(lang_datasets)
+            lang_splits = [split]
+            for language, lang_dataset in zip(languages, lang_datasets):
+                split_name = split + "_" + language
+                lang_splits.append(split_name)
+                self.datasets[split_name] = lang_dataset
+
+            # [TODO]: This is hacky for now to print validation ppl for each
+            # language individually. Maybe need task API changes to allow it
+            # in more generic ways.
+            if split in self.args.valid_subset:
+                self.args.valid_subset = self.args.valid_subset.replace(
+                    split, ",".join(lang_splits)
+                )
+
+        with data_utils.numpy_seed(self.args.seed + epoch):
+            shuffle = np.random.permutation(len(dataset))
+
+        self.datasets[split] = SortDataset(
+            dataset,
+            sort_order=[
+                shuffle,
+                dataset.sizes,
+            ],
+        )
+
+    def build_dataset_for_inference(
+        self, src_tokens, src_lengths, language="en_XX", **kwargs
+    ):
+        """Build the inference dataset: sources get a leading token, targets a <pad>.
+
+        The leading token is eos, or with ``add_bos_token`` the ``<language>``
+        token index (bos if that index is falsy). Handy for prefixes and LM scoring.
+        """
+        dataset = StripTokenDataset(
+            TokenBlockDataset(
+                src_tokens,
+                src_lengths,
+                block_size=None,  # ignored for "eos" break mode
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode="eos",
+            ),
+            # remove eos from (end of) target sequence
+            self.source_dictionary.eos(),
+        )
+
+        src_lang_idx = self.dictionary.index(lang_token(language))
+        src_dataset = PrependTokenDataset(
+            dataset,
+            token=(
+                (src_lang_idx or self.source_dictionary.bos())
+                if getattr(self.args, "add_bos_token", False)
+                else self.source_dictionary.eos()
+            ),
+        )
+
+        max_seq_len = max(src_lengths) + 1  # +1 for the prepended / appended token
+        tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad())
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": PadDataset(
+                        src_dataset,
+                        pad_idx=self.source_dictionary.pad(),
+                        left_pad=False,
+                        pad_length=max_seq_len,
+                    ),
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                },
+                "target": PadDataset(
+                    tgt_dataset,
+                    pad_idx=self.source_dictionary.pad(),
+                    left_pad=False,
+                    pad_length=max_seq_len,
+                ),
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    @torch.no_grad()
+    def inference_step(
+        self,
+        generator,
+        models,
+        sample,
+        language="en_XX",
+        prefix_tokens=None,
+        constraints=None,
+    ):
+        """Run ``generator.generate`` on ``sample``, always conditioned on a bos token."""
+        if getattr(self.args, "add_bos_token", False):
+            src_lang_idx = self.dictionary.index(lang_token(language))
+            bos_token = src_lang_idx or self.source_dictionary.bos()
+        else:
+            bos_token = self.source_dictionary.eos()
+        # bos_token: "<language>" index (bos if that index is falsy) with add_bos_token, else eos
+        if constraints is not None:
+            raise NotImplementedError(
+                "Constrained decoding with the language_modeling task is not supported"
+            )
+
+        # SequenceGenerator doesn't use src_tokens directly, we need to
+        # pass the `prefix_tokens` argument instead
+        if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
+            prefix_tokens = sample["net_input"]["src_tokens"]
+            if prefix_tokens[:, 0].eq(bos_token).all():
+                prefix_tokens = prefix_tokens[:, 1:]
+
+        return generator.generate(
+            models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
+        )
+
+    def eval_lm_dataloader(
+        self,
+        dataset,
+        max_tokens: Optional[int] = 36000,
+        batch_size: Optional[int] = None,
+        max_positions: Optional[int] = None,
+        num_shards: int = 1,
+        shard_id: int = 0,
+        num_workers: int = 1,
+        data_buffer_size: int = 10,
+        # ensures every evaluated token has a context of at least this size, if possible
+        context_window: int = 0,
+    ):
+        """Return a batch iterator over ``dataset``, wrapped for ``context_window`` if > 0."""
+        if context_window > 0:
+            dataset = LMContextWindowDataset(
+                dataset=dataset,
+                tokens_per_sample=self.args.tokens_per_sample,
+                context_window=context_window,
+                pad_idx=self.source_dictionary.pad(),
+            )
+        return self.get_batch_iterator(
+            dataset=dataset,
+            max_tokens=max_tokens,
+            max_sentences=batch_size,  # batch_size is forwarded as max_sentences
+            max_positions=max_positions,
+            ignore_invalid_inputs=True,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            data_buffer_size=data_buffer_size,
+        )
+
+    @property
+    def source_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` used for the input of
+        the language model (``self.dictionary``)."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` used for the output of
+        the language model (``self.output_dictionary``)."""
+        return self.output_dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
new file mode 100644
index 0000000000..156d085aa4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
@@ -0,0 +1,338 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+import torch
+
+from fairseq import utils
+from fairseq.data import (
+    ConcatDataset,
+    Dictionary,
+    IdDataset,
+    MaskTokensDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PadDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    ResamplingDataset,
+    SortDataset,
+    TokenBlockDataset,
+    data_utils,
+    encoders,
+)
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("multilingual_masked_lm")
+class MultiLingualMaskedLMTask(LegacyFairseqTask):
+    """Task for training masked language models (e.g., BERT, RoBERTa)."""
+
+    @staticmethod
+    def add_args(parser):
+        """Register the positional ``data`` argument and the task options on ``parser``."""
+        parser.add_argument(
+            "data",
+            help="colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner",
+        )
+        parser.add_argument(
+            "--sample-break-mode",
+            default="complete",
+            choices=["none", "complete", "complete_doc", "eos"],
+            help='If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.',
+        )
+        parser.add_argument(
+            "--tokens-per-sample",
+            default=512,
+            type=int,
+            help="max number of total tokens over all segments "
+            "per sample for BERT dataset",
+        )
+        parser.add_argument(
+            "--mask-prob",
+            default=0.15,
+            type=float,
+            help="probability of replacing a token with mask",
+        )
+        parser.add_argument(
+            "--leave-unmasked-prob",
+            default=0.1,
+            type=float,
+            help="probability that a masked token is unmasked",
+        )
+        parser.add_argument(
+            "--random-token-prob",
+            default=0.1,
+            type=float,
+            help="probability of replacing a token with a random token",
+        )
+        parser.add_argument(
+            "--freq-weighted-replacement",
+            action="store_true",
+            help="sample random replacement words based on word frequencies",
+        )
+        parser.add_argument(
+            "--mask-whole-words",
+            default=False,  # explicit, although store_true already defaults to False
+            action="store_true",
+            help="mask whole words; you may also want to set --bpe",
+        )
+        parser.add_argument(
+            "--multilang-sampling-alpha",
+            type=float,
+            default=1.0,  # exponent applied in _get_sample_prob; 1.0 leaves shares unchanged
+            help="smoothing alpha for sample rations across multiple datasets",
+        )
+
+    def __init__(self, args, dictionary):
+        """Keep the dictionary and seed, and register the ``<mask>`` symbol."""
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.seed = args.seed
+        # add mask token; the value returned by add_symbol is kept as mask_idx
+        self.mask_idx = dictionary.add_symbol("<mask>")
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        data_dirs = utils.split_paths(args.data)
+        assert len(data_dirs) > 0
+        vocab = Dictionary.load(os.path.join(data_dirs[0], "dict.txt"))  # first path only
+        logger.info("dictionary: {} types".format(len(vocab)))
+        return cls(args, vocab)
+
+    def _get_whole_word_mask(self):
+        """Return a per-token "begins a word" ByteTensor, or None if unavailable."""
+        if not self.args.mask_whole_words:
+            return None
+        bpe = encoders.build_bpe(self.args)
+        if bpe is None:
+            # Previously this path left the result unbound (UnboundLocalError).
+            return None
+
+        def is_beginning_of_word(i):
+            if i < self.source_dictionary.nspecial:
+                # special elements are always considered beginnings
+                return True
+            tok = self.source_dictionary[i]
+            if tok.startswith("madeupword"):
+                return True
+            try:
+                return bpe.is_beginning_of_word(tok)
+            except ValueError:
+                # a ValueError from the BPE is treated as a word start
+                return True
+
+        vocab_size = len(self.source_dictionary)
+        return torch.ByteTensor(list(map(is_beginning_of_word, range(vocab_size))))
+
+    def _get_sample_prob(self, dataset_lens):
+        """Return per-language sampling probabilities smoothed by an exponent.
+
+        Each language's share of ``dataset_lens`` is raised to the power
+        ``multilang_sampling_alpha`` and the result is renormalized to sum to 1.
+        """
+        shares = dataset_lens / dataset_lens.sum()
+        smoothed = shares ** self.args.multilang_sampling_alpha
+        return smoothed / smoothed.sum()
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        languages = sorted(
+            name
+            for name in os.listdir(data_path)
+            if os.path.isdir(os.path.join(data_path, name))
+        )
+
+        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
+        logger.info(
+            "Language to id mapping: ", {lang: id for id, lang in enumerate(languages)}
+        )
+
+        mask_whole_words = self._get_whole_word_mask()
+        lang_datasets = []
+        for lang_id, language in enumerate(languages):
+            split_path = os.path.join(data_path, language, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            if dataset is None:
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, split_path)
+                )
+
+            # create continuous blocks of tokens
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.args.tokens_per_sample - 1,  # one less for <s>
+                pad=self.source_dictionary.pad(),
+                eos=self.source_dictionary.eos(),
+                break_mode=self.args.sample_break_mode,
+            )
+            logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))
+
+            # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
+            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+
+            src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
+                dataset,
+                self.source_dictionary,
+                pad_idx=self.source_dictionary.pad(),
+                mask_idx=self.mask_idx,
+                seed=self.args.seed,
+                mask_prob=self.args.mask_prob,
+                leave_unmasked_prob=self.args.leave_unmasked_prob,
+                random_token_prob=self.args.random_token_prob,
+                freq_weighted_replacement=self.args.freq_weighted_replacement,
+                mask_whole_words=mask_whole_words,
+            )
+
+            lang_dataset = NestedDictionaryDataset(
+                {
+                    "net_input": {
+                        "src_tokens": PadDataset(
+                            src_dataset,
+                            pad_idx=self.source_dictionary.pad(),
+                            left_pad=False,
+                        ),
+                        "src_lengths": NumelDataset(src_dataset, reduce=False),
+                    },
+                    "target": PadDataset(
+                        tgt_dataset,
+                        pad_idx=self.source_dictionary.pad(),
+                        left_pad=False,
+                    ),
+                    "nsentences": NumSamplesDataset(),
+                    "ntokens": NumelDataset(src_dataset, reduce=True),
+                    "lang_id": RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]),
+                },
+                sizes=[src_dataset.sizes],
+            )
+            lang_datasets.append(lang_dataset)
+
+        dataset_lengths = np.array(
+            [len(d) for d in lang_datasets],
+            dtype=float,
+        )
+        logger.info(
+            "loaded total {} blocks for all languages".format(
+                dataset_lengths.sum(),
+            )
+        )
+        if split == self.args.train_subset:
+            # For train subset, additionally up or down sample languages.
+            sample_probs = self._get_sample_prob(dataset_lengths)
+            logger.info(
+                "Sample probability by language: ",
+                {
+                    lang: "{0:.4f}".format(sample_probs[id])
+                    for id, lang in enumerate(languages)
+                },
+            )
+            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
+            logger.info(
+                "Up/Down Sampling ratio by language: ",
+                {
+                    lang: "{0:.2f}".format(size_ratio[id])
+                    for id, lang in enumerate(languages)
+                },
+            )
+
+            resampled_lang_datasets = [
+                ResamplingDataset(
+                    lang_datasets[i],
+                    size_ratio=size_ratio[i],
+                    seed=self.args.seed,
+                    epoch=epoch,
+                    replace=size_ratio[i] >= 1.0,
+                )
+                for i, d in enumerate(lang_datasets)
+            ]
+            dataset = ConcatDataset(resampled_lang_datasets)
+        else:
+            dataset = ConcatDataset(lang_datasets)
+            lang_splits = [split]
+            for lang_id, lang_dataset in enumerate(lang_datasets):
+                split_name = split + "_" + languages[lang_id]
+                lang_splits.append(split_name)
+                self.datasets[split_name] = lang_dataset
+
+            # [TODO]: This is hacky for now to print validation ppl for each
+            # language individually. Maybe need task API changes to allow it
+            # in more generic ways.
+            if split in self.args.valid_subset:
+                self.args.valid_subset = self.args.valid_subset.replace(
+                    split, ",".join(lang_splits)
+                )
+
+        with data_utils.numpy_seed(self.args.seed + epoch):
+            shuffle = np.random.permutation(len(dataset))
+
+        self.datasets[split] = SortDataset(
+            dataset,
+            sort_order=[
+                shuffle,
+                dataset.sizes,
+            ],
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
+        """Wrap raw source tokens in the dataset stack used for inference.
+
+        Tokens go through TokenBlockDataset (break_mode="eos"), PadDataset
+        (left_pad=False) and PrependTokenDataset (BOS); see ``sort`` below.
+        """
+        vocab = self.source_dictionary
+        pad_idx = vocab.pad()
+        blocks = TokenBlockDataset(
+            src_tokens,
+            src_lengths,
+            self.args.tokens_per_sample - 1,  # one less for <s>
+            pad=pad_idx,
+            eos=vocab.eos(),
+            break_mode="eos",
+        )
+        padded = PadDataset(blocks, pad_idx=pad_idx, left_pad=False)
+        tokens = PrependTokenDataset(padded, vocab.bos())
+        net_input = {
+            "src_tokens": tokens,
+            "src_lengths": NumelDataset(tokens, reduce=False),
+        }
+        batch = {"id": IdDataset(), "net_input": net_input}
+        result = NestedDictionaryDataset(batch, sizes=src_lengths)
+        if sort:  # optionally wrap in a SortDataset keyed on src_lengths
+            result = SortDataset(result, sort_order=[src_lengths])
+        return result
+
+    @property
+    def source_dictionary(self):  # exposes self.dictionary as the source-side vocabulary
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):  # same object as source_dictionary: both return self.dictionary
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
new file mode 100644
index 0000000000..cef7656691
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
@@ -0,0 +1,463 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import logging
+import os
+from collections import OrderedDict
+from argparse import ArgumentError
+
+import torch
+from fairseq import options, utils
+from fairseq.logging import metrics
+from fairseq.data import (
+    Dictionary,
+    LanguagePairDataset,
+    RoundRobinZipDatasets,
+    TransformEosLangPairDataset,
+)
+from fairseq.models import FairseqMultiModel
+from fairseq.tasks.translation import load_langpair_dataset
+
+from . import LegacyFairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+def _lang_token(lang: str):
+    return f"__{lang}__"  # language tag symbol: the language code wrapped in double underscores
+
+
+def _lang_token_index(dic: Dictionary, lang: str):
+    """Look up the index of ``lang``'s language token in ``dic``; it must not be unk."""
+    token_idx = dic.index(_lang_token(lang))
+    assert token_idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
+    return token_idx
+
+
+@register_task("multilingual_translation")
+class MultilingualTranslationTask(LegacyFairseqTask):
+    """A task for training multiple translation models simultaneously.
+
+    We iterate round-robin over batches from multiple language pairs, ordered
+    according to the `--lang-pairs` argument.
+
+    The training loop is roughly:
+
+        for i in range(len(epoch)):
+            for lang_pair in args.lang_pairs:
+                batch = next_batch_for_lang_pair(lang_pair)
+                loss = criterion(model_for_lang_pair(lang_pair), batch)
+                loss.backward()
+            optimizer.step()
+
+    In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset
+    (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that
+    implements the `FairseqMultiModel` interface.
+
+    During inference it is required to specify a single `--source-lang` and
+    `--target-lang`, which indicates the inference language direction.
+    `--lang-pairs`, `--encoder-langtok`, `--decoder-langtok` have to be set to
+    the same value as training.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('data', metavar='DIR', help='path to data directory')
+        parser.add_argument('--lang-pairs', default=None, metavar='PAIRS',
+                            help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr')
+        parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
+                            help='source language (only needed for inference)')
+        parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
+                            help='target language (only needed for inference)')
+        parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL',
+                            help='pad the source on the left (default: True)')
+        parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL',
+                            help='pad the target on the left (default: False)')
+        try:
+            parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
+                                help='max number of tokens in the source sequence')
+            parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
+                                help='max number of tokens in the target sequence')
+        except ArgumentError:
+            # already registered elsewhere: keep that definition. NOTE(review): if the first add_argument raises, the second is skipped too.
+            pass  # once this is transitioned to hydra it should be fine to add the options here
+        parser.add_argument('--upsample-primary', default=1, type=int,
+                            help='amount to upsample primary dataset')
+        parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'],
+                            metavar='SRCTGT',
+                            help='replace beginning-of-sentence in source sentence with source or target '
+                                 'language token. (src/tgt)')
+        parser.add_argument('--decoder-langtok', action='store_true',
+                            help='replace beginning-of-sentence in target sentence with target language token')
+        # fmt: on
+
+    def __init__(self, args, dicts, training):
+        """Store the per-language dictionaries and derive the language-pair lists."""
+        super().__init__(args)
+        self.dicts = dicts
+        self.training = training
+        # Training uses every configured pair; otherwise only the single
+        # direction given by args.source_lang / args.target_lang.
+        self.lang_pairs = (
+            args.lang_pairs
+            if training
+            else ["{}-{}".format(args.source_lang, args.target_lang)]
+        )
+        # eval_lang_pairs normally equals lang_pairs; subclasses (multitask
+        # settings, or optimizing for certain languages) may use another subset.
+        self.eval_lang_pairs = self.lang_pairs
+        # model_lang_pairs is used by models.build_model() to build the
+        # encoder-decoder pairs; subclasses may build other models from it.
+        self.model_lang_pairs = self.lang_pairs
+        self.langs = list(dicts.keys())
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):  # alternate constructor: prepare() loads the dictionaries, then the task is built
+        dicts, training = cls.prepare(args, **kwargs)
+        return cls(args, dicts, training)
+
+    @classmethod
+    def update_args(cls, args):
+        """Run the pad flags through utils.eval_bool and split a string lang_pairs on commas."""
+        args.left_pad_source = utils.eval_bool(args.left_pad_source)
+        args.left_pad_target = utils.eval_bool(args.left_pad_target)
+        pairs = args.lang_pairs
+        if pairs is None:
+            msg = "--lang-pairs is required. List all the language pairs in the training objective."
+            raise ValueError(msg)
+        if isinstance(pairs, str):
+            args.lang_pairs = pairs.split(",")
+
+    @classmethod
+    def prepare(cls, args, **kargs):
+        """Normalise ``args`` and load one dictionary per language.
+
+        Returns ``(dicts, training)``; ``training`` is False as soon as either
+        ``args.source_lang`` or ``args.target_lang`` is set.
+        """
+        cls.update_args(args)
+        sorted_langs = sorted({x for pair in args.lang_pairs for x in pair.split("-")})
+        training = args.source_lang is None and args.target_lang is None
+        add_langtok = args.encoder_langtok is not None or args.decoder_langtok
+        # the data directory is the same for every language: resolve it once
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        dicts = OrderedDict()
+        for lang in sorted_langs:
+            vocab = cls.load_dictionary(os.path.join(paths[0], "dict.{}.txt".format(lang)))
+            dicts[lang] = vocab
+            # special-symbol indices must match those of the first dictionary
+            first = dicts[sorted_langs[0]]
+            assert vocab.pad() == first.pad()
+            assert vocab.eos() == first.eos()
+            assert vocab.unk() == first.unk()
+            if add_langtok:
+                for lang_to_add in sorted_langs:
+                    vocab.add_symbol(_lang_token(lang_to_add))
+            logger.info("[{}] dictionary: {} types".format(lang, len(vocab)))
+        return dicts, training
+
+    def get_encoder_langtok(self, src_lang, tgt_lang):
+        """Return the encoder language token index, or source EOS when encoder_langtok is unset."""
+        mode, src_dict = self.args.encoder_langtok, self.dicts[src_lang]
+        if mode is None:
+            return src_dict.eos()
+        # "src" tags with the source language; any other value with the target
+        return _lang_token_index(src_dict, src_lang if mode == "src" else tgt_lang)
+
+    def get_decoder_langtok(self, tgt_lang):
+        """Return ``tgt_lang``'s token index, or target EOS when decoder_langtok is off."""
+        tgt_dict = self.dicts[tgt_lang]
+        return _lang_token_index(tgt_dict, tgt_lang) if self.args.decoder_langtok else tgt_dict.eos()
+
+    def alter_dataset_langtok(
+        self,
+        lang_pair_dataset,
+        src_eos=None,
+        src_lang=None,
+        tgt_eos=None,
+        tgt_lang=None,
+    ):
+        """Wrap ``lang_pair_dataset`` in a TransformEosLangPairDataset if needed.
+
+        The dataset is returned untouched when neither ``encoder_langtok`` nor
+        ``decoder_langtok`` is configured. A side whose option is off, or whose
+        EOS/language arguments are missing, reaches the wrapper as ``None``
+        (note that ``tgt_eos`` is forwarded as the wrapper's ``tgt_bos`` argument).
+        """
+        args = self.args
+        if args.encoder_langtok is None and not args.decoder_langtok:
+            return lang_pair_dataset
+
+        # source side: needs the option plus src_eos, src_lang and tgt_lang
+        swap_src = args.encoder_langtok is not None and src_eos is not None
+        swap_src = swap_src and src_lang is not None and tgt_lang is not None
+        new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang) if swap_src else None
+
+        # target side: needs the option plus tgt_eos and tgt_lang
+        swap_tgt = args.decoder_langtok and tgt_eos is not None and tgt_lang is not None
+        new_tgt_bos = self.get_decoder_langtok(tgt_lang) if swap_tgt else None
+
+        return TransformEosLangPairDataset(
+            lang_pair_dataset,
+            src_eos=src_eos if swap_src else None,
+            new_src_eos=new_src_eos,
+            tgt_bos=tgt_eos if swap_tgt else None,
+            new_tgt_bos=new_tgt_bos,
+        )
+
+    def load_dataset(self, split, epoch=1, **kwargs):
+        """Load ``split`` for every language pair into ``self.datasets[split]``.
+
+        The data directory is picked round-robin from the paths in
+        ``args.data`` according to ``epoch``.
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        args = self.args
+
+        def language_pair_dataset(lang_pair):
+            src, tgt = lang_pair.split("-")
+            src_dict, tgt_dict = self.dicts[src], self.dicts[tgt]
+            langpair_dataset = load_langpair_dataset(
+                data_path,
+                split,
+                src,
+                src_dict,
+                tgt,
+                tgt_dict,
+                combine=True,
+                dataset_impl=args.dataset_impl,
+                upsample_primary=args.upsample_primary,
+                left_pad_source=args.left_pad_source,
+                left_pad_target=args.left_pad_target,
+                max_source_positions=args.max_source_positions,
+                max_target_positions=args.max_target_positions,
+            )
+            return self.alter_dataset_langtok(
+                langpair_dataset,
+                src_eos=src_dict.eos(),
+                src_lang=src,
+                tgt_eos=tgt_dict.eos(),
+                tgt_lang=tgt,
+            )
+
+        per_pair = OrderedDict(
+            (lang_pair, language_pair_dataset(lang_pair)) for lang_pair in self.lang_pairs
+        )
+        eval_key = None if self.training else "%s-%s" % (args.source_lang, args.target_lang)
+        self.datasets[split] = RoundRobinZipDatasets(per_pair, eval_key=eval_key)
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        """Wrap source tokens for the ``source_lang``-``target_lang`` direction.
+
+        Raises NotImplementedError when ``constraints`` is given.
+        """
+        if constraints is not None:
+            raise NotImplementedError(
+                "Constrained decoding with the multilingual_translation task is not supported"
+            )
+
+        args = self.args
+        lang_pair = "%s-%s" % (args.source_lang, args.target_lang)
+        pair_dataset = LanguagePairDataset(src_tokens, src_lengths, self.source_dictionary)
+        tagged = self.alter_dataset_langtok(
+            pair_dataset,
+            src_eos=self.source_dictionary.eos(),
+            src_lang=args.source_lang,
+            tgt_eos=self.target_dictionary.eos(),
+            tgt_lang=args.target_lang,
+        )
+
+        # the wrapper holds just this one pair, which is also its eval_key
+        return RoundRobinZipDatasets(
+            OrderedDict([(lang_pair, tagged)]),
+            eval_key=lang_pair,
+        )
+
+    def build_model(self, args, from_checkpoint=False):
+        def check_args():  # collect every task/model argument mismatch and raise them together
+            messages = []
+            if (
+                len(set(self.args.lang_pairs).symmetric_difference(args.lang_pairs))
+                != 0
+            ):
+                messages.append(
+                    "--lang-pairs should include all the language pairs {}.".format(
+                        args.lang_pairs
+                    )
+                )
+            if self.args.encoder_langtok != args.encoder_langtok:
+                messages.append(
+                    "--encoder-langtok should be {}.".format(args.encoder_langtok)
+                )
+            if self.args.decoder_langtok != args.decoder_langtok:
+                messages.append(
+                    "--decoder-langtok should {} be set.".format(
+                        "" if args.decoder_langtok else "not"
+                    )
+                )
+
+            if len(messages) > 0:
+                raise ValueError(" ".join(messages))
+
+        # The model args may be a different object from the one the task
+        # normalised earlier, so run update_args on them here as well.
+        self.update_args(args)
+
+        # Check that the task args are consistent with the model args
+        check_args()
+
+        from fairseq import models  # NOTE(review): function-level import, presumably to avoid a circular import; confirm
+
+        model = models.build_model(args, self, from_checkpoint)
+        if not isinstance(model, FairseqMultiModel):
+            raise ValueError(
+                "MultilingualTranslationTask requires a FairseqMultiModel architecture"
+            )
+        return model
+
+    def _per_lang_pair_train_loss(
+        self, lang_pair, model, update_num, criterion, sample, optimizer, ignore_grad
+    ):
+        """Compute the criterion for one language pair's batch and run backward on it."""
+        pair_model, pair_batch = model.models[lang_pair], sample[lang_pair]
+        loss, sample_size, logging_output = criterion(pair_model, pair_batch)
+        if ignore_grad:
+            loss *= 0  # backward still runs below, on a zeroed loss
+        optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        """Accumulate loss and gradients over every language pair in ``sample``.
+
+        Returns the summed loss, summed sample size and a dict of logging
+        values stored both under their plain key and under ``"<pair>:<key>"``.
+        """
+        from collections import defaultdict
+
+        model.train()
+        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, defaultdict(float)
+        curr_lang_pairs = [
+            pair
+            for pair in self.model_lang_pairs
+            if sample[pair] is not None and len(sample[pair]) != 0
+        ]
+        last_idx = len(curr_lang_pairs) - 1
+        for idx, lang_pair in enumerate(curr_lang_pairs):
+            # no_sync() is entered only when distributed_world_size > 1, never for the last pair
+            multi_worker = self.args.distributed_world_size > 1
+            if multi_worker and hasattr(model, "no_sync") and idx < last_idx:
+                sync_ctx = model.no_sync()
+            else:
+                sync_ctx = contextlib.ExitStack()  # dummy contextmanager
+            with sync_ctx:
+                loss, sample_size, logging_output = self._per_lang_pair_train_loss(
+                    lang_pair,
+                    model,
+                    update_num,
+                    criterion,
+                    sample,
+                    optimizer,
+                    ignore_grad,
+                )
+            agg_loss += loss.detach().item()
+            # TODO make summing of the sample sizes configurable
+            agg_sample_size += sample_size
+            for key, value in logging_output.items():
+                agg_logging_output[key] += value
+                agg_logging_output[f"{lang_pair}:{key}"] += value
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def _per_lang_pair_valid_loss(self, lang_pair, model, criterion, sample):  # criterion output for one pair's sub-model and batch
+        return criterion(model.models[lang_pair], sample[lang_pair])
+
+    def valid_step(self, sample, model, criterion):
+        """Sum validation loss and logging values over ``eval_lang_pairs``."""
+        from collections import defaultdict
+
+        model.eval()
+        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, defaultdict(float)
+        with torch.no_grad():
+            for lang_pair in self.eval_lang_pairs:
+                # skip pairs that are absent from, or empty in, this sample
+                has_batch = lang_pair in sample and sample[lang_pair] is not None
+                if not has_batch or len(sample[lang_pair]) == 0:
+                    continue
+                loss, sample_size, logging_output = self._per_lang_pair_valid_loss(
+                    lang_pair, model, criterion, sample
+                )
+                agg_loss += loss.data.item()
+                # TODO make summing of the sample sizes configurable
+                agg_sample_size += sample_size
+                for key, value in logging_output.items():
+                    agg_logging_output[key] += value
+                    agg_logging_output[f"{lang_pair}:{key}"] += value
+
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        """Call ``generator.generate`` with the target language token (or EOS) as bos_token."""
+        tgt_dict = self.target_dictionary
+        with torch.no_grad():
+            if self.args.decoder_langtok:
+                bos_token = _lang_token_index(tgt_dict, self.args.target_lang)
+            else:
+                bos_token = tgt_dict.eos()
+            return generator.generate(
+                models,
+                sample,
+                prefix_tokens=prefix_tokens,
+                constraints=constraints,
+                bos_token=bos_token,
+            )
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Run the base-class reduction, then log the summed size counters."""
+        with metrics.aggregate():
+            super().reduce_metrics(logging_outputs, criterion)
+            for k in ("sample_size", "nsentences", "ntokens"):
+                metrics.log_scalar(k, sum(out[k] for out in logging_outputs))
+
+    @property
+    def source_dictionary(self):
+        """First loaded dictionary while training, else the one for args.source_lang."""
+        if not self.training:
+            return self.dicts[self.args.source_lang]
+        return next(iter(self.dicts.values()))
+
+    @property
+    def target_dictionary(self):
+        """First loaded dictionary while training, else the one for args.target_lang."""
+        if not self.training:
+            return self.dicts[self.args.target_lang]
+        return next(iter(self.dicts.values()))
+
+    def max_positions(self):
+        """Return the max sentence length allowed by the task.
+
+        The result maps each language pair to a (source, target) limit tuple.
+        """
+        limits = (self.args.max_source_positions, self.args.max_target_positions)
+        if len(self.datasets.values()) == 0:
+            # nothing loaded yet: report only the source_lang-target_lang pair
+            return {"%s-%s" % (self.args.source_lang, self.args.target_lang): limits}
+        # one entry per language pair found in any loaded split
+        return OrderedDict(
+            [
+                (key, limits)
+                for split in self.datasets.keys()
+                for key in self.datasets[split].datasets.keys()
+            ]
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
new file mode 100644
index 0000000000..a335021335
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
@@ -0,0 +1,477 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import os
+import torch
+import json
+
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Optional, Any
+
+from fairseq.data import AddTargetDataset, Dictionary, encoders
+from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.configs import GenerationConfig
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+from . import register_task
+from .. import utils
+from ..logging import metrics
+
+
+logger = logging.getLogger(__name__)
+
+
+class LabelEncoder(object):
+    """Callable that encodes a label line with ``dictionary`` (no EOS, no new symbols)."""
+
+    def __init__(self, dictionary):
+        self.dictionary = dictionary
+
+    def __call__(self, label):
+        return self.dictionary.encode_line(label, append_eos=False, add_if_not_exist=False)
+
+
+def label_len_fn(label):
+    return label.count(" ") + 1  # same value as len(label.split(" ")): one piece per single space, plus one
+
+
+@dataclass
+class NLUFinetuningConfig(AudioPretrainingConfig):
+    # Options for reporting WER/BLEU metrics during validation. They only take
+    # effect together with ``autoregressive`` (see NLUFinetuningTask.valid_step)
+    eval_wer: bool = field(
+        default=False, metadata={"help": "compute WER for Seq2Seq models"}
+    )
+    eval_wer_parse: bool = field(
+        default=False, metadata={"help": "compute WER, exact-match and tree errors for Seq2Seq models"}
+    )
+    eval_wer_config: GenerationConfig = field(
+        default_factory=lambda: GenerationConfig(),
+        metadata={"help": "beam search config for evaluating wer during training"},
+    )
+    eval_wer_tokenizer: Any = field(
+        default=None,
+        metadata={"help": "tokenizer config for evaluating wer during training"},
+    )
+    eval_wer_post_process: str = field(
+        default="letter",
+        metadata={
+            "help": "remove BPE tokens before scoring (can be sentencepiece, letter, and more)"
+        },
+    )
+    eval_bleu: bool = field(
+        default=False, metadata={"help": "evaluation with BLEU scores"}
+    )
+    eval_bleu_detok: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "detokenize before computing BLEU (e.g., 'moses'); "
+            "required if using --eval-bleu; use 'space' to disable "
+            "detokenization; see fairseq.data.encoders for other options"
+        },
+    )
+    eval_bleu_detok_args: str = field(  # JSON text; parsed with json.loads in build_model
+        default="{}", metadata={"help": "args for building the tokenizer, if needed"}
+    )
+    eval_tokenized_bleu: bool = field(
+        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
+    )
+    eval_bleu_remove_bpe: Optional[str] = field(
+        default=None, metadata={"help": "remove BPE before computing BLEU"}
+    )
+    eval_bleu_args: str = field(  # JSON text; parsed with json.loads in build_model
+        default="{}",
+        metadata={
+            "help": "generation args for BLEU scoring, e.g., "
+            '\'{"beam": 4, "lenpen": 0.6}\''
+        },
+    )
+    eval_bleu_print_samples: bool = field(
+        default=False, metadata={"help": "print sample generations during validation"}
+    )
+    autoregressive: bool = field(
+        default=False,
+        metadata={
+            "help": "required for autoregressive decoders (like seq2seq models); "
+            "adds 'prev_output_tokens' to input and appends eos to target"
+        },
+    )
+
+
+@register_task("nlu_finetuning", dataclass=NLUFinetuningConfig)
+class NLUFinetuningTask(AudioPretrainingTask):
+    """ """
+
+    cfg: NLUFinetuningConfig
+
+    def __init__(self, cfg: NLUFinetuningConfig):
+        """Initialise the task and register the target-dictionary factory."""
+        super().__init__(cfg)
+        self.blank_symbol = "<s>"
+        # register load_target_dictionary as the factory behind
+        # ``self.state.target_dictionary`` (read by the target_dictionary
+        # property of this class)
+        self.state.add_factory("target_dictionary", self.load_target_dictionary)
+
+    def load_target_dictionary(self):
+        """Load ``dict.<labels>.txt`` from the data dir; None when no labels are set."""
+        if not self.cfg.labels:
+            return None
+        return Dictionary.load(os.path.join(self.cfg.data, f"dict.{self.cfg.labels}.txt"))
+
+    def load_dataset(self, split: str, task_cfg: NLUFinetuningConfig = None, **kwargs):
+        """Load ``split`` via the parent task, then attach its label file as targets.
+
+        Labels are read from ``<data>/<split>.<labels>``; lines whose index is in
+        the dataset's ``skipped_indices`` are dropped, and the rest must match its length.
+        """
+        super().load_dataset(split, task_cfg, **kwargs)
+        task_cfg = task_cfg or self.cfg
+        assert task_cfg.labels is not None
+        level = getattr(TextCompressionLevel, str(self.cfg.text_compression_level))
+        label_path = os.path.join(self.cfg.data, f"{split}.{task_cfg.labels}")
+        base_dataset = self.datasets[split]
+        skipped_indices = getattr(base_dataset, "skipped_indices", set())
+        text_compressor = TextCompressor(level=level)
+        with open(label_path, "r") as f:
+            labels = [
+                text_compressor.compress(line)
+                for idx, line in enumerate(f)
+                if idx not in skipped_indices
+            ]
+
+        assert len(labels) == len(base_dataset), (
+            f"labels length ({len(labels)}) and dataset length "
+            f"({len(base_dataset)}) do not match"
+        )
+        target_dict = self.target_dictionary
+        self.datasets[split] = AddTargetDataset(
+            base_dataset,
+            labels,
+            pad=target_dict.pad(),
+            eos=target_dict.eos(),
+            batch_targets=True,
+            process_label=LabelEncoder(target_dict),
+            label_len_fn=label_len_fn,
+            add_to_input=task_cfg.get("autoregressive", False),
+            text_compression_level=level,
+        )
+
+    @property
+    def target_dictionary(self):
+        """Return the target dictionary stored on ``self.state`` (the entry
+        registered through ``state.add_factory`` in ``__init__``)."""
+        return self.state.target_dictionary
+
+    def valid_step(self, sample, model, criterion):  # parent validation step plus optional WER/BLEU counters
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        if self.cfg.eval_wer_parse and self.cfg.autoregressive:
+            metrics = self._inference_with_wer_parse(  # NOTE: this local shadows the module-level ``metrics`` import
+                self.sequence_generator, sample, model
+            )
+            logging_output["_num_char_errors"] = metrics["num_char_errors"]
+            logging_output["_num_chars"] = metrics["num_chars"]
+            logging_output["_num_word_errors"] = metrics["num_word_errors"]
+            logging_output["_num_words"] = metrics["num_words"]
+            logging_output["_num_em_errors"] = metrics["num_em_errors"]
+            logging_output["_num_ems"] = metrics["num_ems"]
+            logging_output["_num_tree_errors"] = metrics["num_tree_errors"]
+            logging_output["_num_trees"] = metrics["num_trees"]
+        if self.cfg.eval_wer and self.cfg.autoregressive:  # runs after the parse branch and overwrites the four keys they share
+            metrics = self._inference_with_wer(self.sequence_generator, sample, model)
+            logging_output["_num_char_errors"] = metrics["num_char_errors"]
+            logging_output["_num_chars"] = metrics["num_chars"]
+            logging_output["_num_word_errors"] = metrics["num_word_errors"]
+            logging_output["_num_words"] = metrics["num_words"]
+        if self.cfg.eval_bleu and self.cfg.autoregressive:
+            metrics = self._inference_with_bleu(self.sequence_generator, sample, model)
+            logging_output["_bleu_sys_len"] = metrics.sys_len
+            logging_output["_bleu_ref_len"] = metrics.ref_len
+            # we split counts into separate entries so that they can be
+            # summed efficiently across workers using fast-stat-sync
+            assert len(metrics.counts) == 4
+            for i in range(4):
+                logging_output[f"_bleu_counts_{i}"] = metrics.counts[i]
+                logging_output[f"_bleu_totals_{i}"] = metrics.totals[i]
+        return loss, sample_size, logging_output
+
+    def build_model(self, model_cfg: FairseqDataclass):  # build the model, then the generator/tokenizer used by valid_step
+        model = super().build_model(model_cfg)
+
+        if (self.cfg.eval_wer or self.cfg.eval_wer_parse) and self.cfg.autoregressive:
+            self.sequence_generator = self.build_generator(
+                [model],
+                self.cfg.eval_wer_config,
+            )
+            if self.cfg.eval_wer_tokenizer:
+                self.tokenizer = encoders.build_tokenizer(self.cfg.eval_wer_tokenizer)
+            else:
+                self.tokenizer = None
+        if self.cfg.eval_bleu and self.cfg.autoregressive:  # when also enabled, replaces the tokenizer/generator set by the WER branch
+            assert self.cfg.eval_bleu_detok is not None, (
+                "--eval-bleu-detok is required if using --eval-bleu; "
+                "try --eval-bleu-detok=moses (or --eval-bleu-detok=space "
+                "to disable detokenization, e.g., when using sentencepiece)"
+            )
+            detok_args = json.loads(self.cfg.eval_bleu_detok_args)  # config value is JSON text
+            self.tokenizer = encoders.build_tokenizer(
+                Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args)
+            )
+            gen_args = json.loads(self.cfg.eval_bleu_args)  # config value is JSON text
+            gen_args = Namespace(**gen_args)
+            self.sequence_generator = self.build_generator([model], gen_args)
+
+        return model
+
+    def _inference_with_wer_parse(self, generator, sample, model):
+        """Decode ``sample`` and count character, word, exact-match and tree errors.
+
+        Returns a dict of raw counters; ``reduce_metrics`` turns them into rates.
+        """
+        import editdistance
+
+        def decode(toks):
+            s = self.target_dictionary.string(
+                toks.int().cpu(),
+                self.cfg.eval_wer_post_process,
+                escape_unk=True,
+            )
+            if self.tokenizer:
+                s = self.tokenizer.decode(s)
+            return s
+
+        def decode_to_list(toks):
+            def token_string(i):
+                if i == self.target_dictionary.unk():
+                    return self.target_dictionary.unk_string(False)
+                else:
+                    return self.target_dictionary[i]
+
+            return [token_string(i) for i in toks]
+
+        def is_ont_token(token):
+            return "[" in token or "]" in token
+
+        def post_process(tokens):
+            # Drop EOS and "|", map "_" to a space, add a space after bracket tokens.
+            out = []
+            for w in tokens:
+                if w == self.target_dictionary.eos_word or w == "|":
+                    continue
+                if w == "_":
+                    out.append(" ")
+                else:
+                    out.append(w)
+                    if is_ont_token(w):
+                        out.append(" ")
+            return out
+
+        num_word_errors, num_char_errors = 0, 0
+        num_chars, num_words = 0, 0
+        num_em_errors, num_ems = 0, 0
+        num_tree_errors, num_trees = 0, 0
+        gen_out = self.inference_step(generator, [model], sample, None)
+        for i, hypos in enumerate(gen_out):
+            hyp_tokens = hypos[0]["tokens"]
+            ref_tokens = utils.strip_pad(
+                sample["target"][i], self.target_dictionary.pad()
+            )
+            hyp_list = post_process(decode_to_list(hyp_tokens))
+            ref_list = post_process(decode_to_list(ref_tokens))
+
+            # Character-level errors use the post-processed token lists.
+            hyp = "".join(hyp_list).strip()
+            ref = "".join(ref_list).strip()
+            num_chars += len(ref)
+            num_char_errors += editdistance.eval(hyp, ref)
+            # The "tree" is the sequence of bracketed tokens only.
+            hyp_tree = [word for word in hyp_list if is_ont_token(word)]
+            ref_tree = [word for word in ref_list if is_ont_token(word)]
+
+            # Word-level errors use the dictionary/tokenizer decoding instead.
+            hyp_before = decode(hyp_tokens).split()
+            ref_before = decode(ref_tokens).split()
+            num_word_errors += editdistance.eval(hyp_before, ref_before)
+            num_words += len(ref_before)
+            if hyp != ref:
+                num_em_errors += 1
+            if hyp_tree != ref_tree:
+                num_tree_errors += 1
+            num_ems += 1
+            num_trees += 1
+
+        return {
+            "num_char_errors": num_char_errors,
+            "num_chars": num_chars,
+            "num_word_errors": num_word_errors,
+            "num_words": num_words,
+            "num_ems": num_ems,
+            "num_em_errors": num_em_errors,
+            "num_trees": num_trees,
+            "num_tree_errors": num_tree_errors,
+        }
+
+    def _inference_with_wer(self, generator, sample, model):
+        """Decode ``sample`` and return raw character/word error counters.
+
+        Character counts compare the decoded strings directly; word counts
+        compare their whitespace-split tokens.
+        """
+        import editdistance
+
+        def to_text(token_ids):
+            text = self.target_dictionary.string(
+                token_ids.int().cpu(),
+                self.cfg.eval_wer_post_process,
+                escape_unk=True,
+            )
+            return self.tokenizer.decode(text) if self.tokenizer else text
+
+        totals = {
+            "num_char_errors": 0,
+            "num_chars": 0,
+            "num_word_errors": 0,
+            "num_words": 0,
+        }
+        hypos = self.inference_step(generator, [model], sample, None)
+        for i, nbest in enumerate(hypos):
+            # Only the first entry (index 0) of each result is scored.
+            hyp = to_text(nbest[0]["tokens"])
+            pad_idx = self.target_dictionary.pad()
+            ref = to_text(utils.strip_pad(sample["target"][i], pad_idx))
+            hyp_words, ref_words = hyp.split(), ref.split()
+            totals["num_char_errors"] += editdistance.eval(hyp, ref)
+            totals["num_chars"] += len(ref)
+            totals["num_word_errors"] += editdistance.eval(hyp_words, ref_words)
+            totals["num_words"] += len(ref_words)
+        return totals
+
+    def _inference_with_bleu(self, generator, sample, model):
+        """Generate hypotheses for ``sample`` and score them with sacrebleu."""
+        import sacrebleu
+
+        def decode(toks, is_ref):
+            # fairseq's default unknown string is `<unk>`, which sacrebleu
+            # tokenizes as `< unk >`, inflating BLEU scores. Use a more verbose
+            # placeholder instead: it is unlikely to appear in a real reference
+            # and is not split into multiple tokens.
+            placeholder = "UNKNOWNTOKENINREF" if is_ref else "UNKNOWNTOKENINHYP"
+            text = self.target_dictionary.string(
+                toks.int().cpu(),
+                self.cfg.eval_bleu_remove_bpe,
+                unk_string=placeholder,
+            )
+            return self.tokenizer.decode(text) if self.tokenizer else text
+
+        gen_out = self.inference_step(generator, [model], sample)
+        hyps, refs = [], []
+        for i, hypos in enumerate(gen_out):
+            target = utils.strip_pad(sample["target"][i], self.target_dictionary.pad())
+            hyps.append(decode(hypos[0]["tokens"], is_ref=False))
+            # is_ref=True: don't count <unk> as matches to the hypo
+            refs.append(decode(target, is_ref=True))
+        if self.cfg.eval_bleu_print_samples:
+            first_id = sample["id"][0]
+            logger.info("H-{} {}".format(first_id, hyps[0]))
+            logger.info("T-{} {}".format(first_id, refs[0]))
+        # "none" leaves the text as-is; otherwise sacrebleu's "13a" tokenizer is used.
+        if self.cfg.eval_tokenized_bleu:
+            eval_tokenization = "none"
+        else:
+            eval_tokenization = "13a"
+        return sacrebleu.corpus_bleu(hyps, [refs], tokenize=eval_tokenization)
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Sum the WER/EM/tree/BLEU counters over ``logging_outputs`` and log derived rates."""
+        super().reduce_metrics(logging_outputs, criterion)
+        if self.cfg.eval_wer or self.cfg.eval_wer_parse:
+            zero = torch.scalar_tensor(0.0)  # default for logs that lack a counter
+            num_char_errors = sum(
+                log.get("_num_char_errors", zero) for log in logging_outputs
+            )
+            num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
+            num_word_errors = sum(
+                log.get("_num_word_errors", zero) for log in logging_outputs
+            )
+            num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
+            metrics.log_scalar("_num_char_errors", num_char_errors)
+            metrics.log_scalar("_num_chars", num_chars)
+            metrics.log_scalar("_num_word_errors", num_word_errors)
+            metrics.log_scalar("_num_words", num_words)
+            if num_chars > 0:
+                metrics.log_derived(
+                    "uer",
+                    lambda meters: meters["_num_char_errors"].sum
+                    * 100.0
+                    / meters["_num_chars"].sum
+                    if meters["_num_chars"].sum > 0
+                    else float("nan"),
+                )
+            if num_words > 0:
+                metrics.log_derived(
+                    "wer",
+                    lambda meters: meters["_num_word_errors"].sum
+                    * 100.0
+                    / meters["_num_words"].sum
+                    if meters["_num_words"].sum > 0
+                    else float("nan"),
+                )
+            if self.cfg.eval_wer_parse:
+                num_em_errors = sum(
+                    log.get("_num_em_errors", zero) for log in logging_outputs
+                )
+                num_ems = sum(log.get("_num_ems", zero) for log in logging_outputs)
+                metrics.log_scalar("_num_em_errors", num_em_errors)
+                metrics.log_scalar("_num_ems", num_ems)
+                num_tree_errors = sum(
+                    log.get("_num_tree_errors", zero) for log in logging_outputs
+                )
+                num_trees = sum(log.get("_num_trees", zero) for log in logging_outputs)
+                metrics.log_scalar("_num_tree_errors", num_tree_errors)
+                metrics.log_scalar("_num_trees", num_trees)
+                # A rate is only registered when this call saw a non-zero total.
+                if num_ems > 0:
+                    metrics.log_derived(
+                        "em_error",
+                        lambda meters: meters["_num_em_errors"].sum
+                        * 100.0
+                        / meters["_num_ems"].sum
+                        if meters["_num_ems"].sum > 0
+                        else float("nan"),
+                    )
+                if num_trees > 0:
+                    metrics.log_derived(
+                        "tree_error",
+                        lambda meters: meters["_num_tree_errors"].sum
+                        * 100.0
+                        / meters["_num_trees"].sum
+                        if meters["_num_trees"].sum > 0
+                        else float("nan"),
+                    )
+
+        if self.cfg.eval_bleu:
+            len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
+            count_keys = [f"_bleu_counts_{i}" for i in range(4)]
+            total_keys = [f"_bleu_totals_{i}" for i in range(4)]
+            for k in len_keys + count_keys + total_keys:
+                metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs))
+
+            import sacrebleu
+            # BLEU is recomputed from the summed counts, totals and lengths.
+            metrics.log_derived(
+                "bleu",
+                lambda meters: sacrebleu.compute_bleu(
+                    correct=[meters[k].sum for k in count_keys],
+                    total=[meters[k].sum for k in total_keys],
+                    sys_len=meters["_bleu_sys_len"].sum,
+                    ref_len=meters["_bleu_ref_len"].sum,
+                    smooth_method="exp",
+                ).score,
+            )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
new file mode 100644
index 0000000000..da24fe8981
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
@@ -0,0 +1,683 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import json
+import logging
+import math
+import os
+from argparse import Namespace
+from collections import OrderedDict, defaultdict
+from pathlib import Path
+from typing import Dict, Sequence, Tuple
+from argparse import ArgumentError
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import fairseq
+from fairseq import options, utils
+from fairseq.logging import metrics
+from fairseq.data import (
+    FairseqDataset,
+    LanguagePairDataset,
+    NoisingDataset,
+    PrependTokenDataset,
+    RoundRobinZipDatasets,
+    TransformEosLangPairDataset,
+    data_utils,
+    encoders,
+)
+from fairseq.sequence_generator import SequenceGenerator
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import TranslationTask, load_langpair_dataset
+
+logger = logging.getLogger(__name__)
+
+
+class PiecewiseLinearFn:
+    """Piecewise linear function. Can be configured with a string."""
+
+    def __init__(self, pieces: Sequence[Tuple[int, float]]):
+        # Compare as a list so that tuples (any Sequence) validate correctly.
+        assert list(pieces) == sorted(
+            pieces
+        ), f"PiecewiseLinearFn configuration should be sorted, received: {pieces}"
+        self.pieces = pieces
+
+    def __call__(self, x: int) -> float:
+        """Interpolate at ``x``; outside every segment, return the last value."""
+        for (x_a, y_a), (x_b, y_b) in zip(self.pieces, self.pieces[1:]):
+            # Zero-width segments (a step such as "0:0,0:1") are skipped so
+            # that they cannot trigger a division by zero.
+            if x_a < x_b and x_a <= x <= x_b:
+                return y_a + (x - x_a) * (y_b - y_a) / (x_b - x_a)
+        return self.pieces[-1][1]
+
+    @staticmethod
+    def from_string(configuration: str) -> "PiecewiseLinearFn":
+        """
+        Parse the configuration of lambda coefficient (for scheduling).
+        x = "3"                  # lambda will be a constant equal to x
+        x = "0:1,1000:0"         # lambda will start from 1 and linearly decrease
+                                 # to 0 during the first 1000 iterations
+        x = "0:0,1000:0,2000:1"  # lambda will be equal to 0 for the first 1000
+                                 # iterations, then will linearly increase to 1 until iteration 2000
+        """
+        if isinstance(configuration, (int, float)):
+            return PiecewiseLinearFn([(0, float(configuration))])
+
+        try:
+            parts = configuration.split(",")
+            if len(parts) == 1:
+                return PiecewiseLinearFn([(0, float(configuration))])
+
+            pieces = [(int(t), float(v)) for t, v in (s.split(":") for s in parts)]
+            return PiecewiseLinearFn(pieces)
+        except Exception as err:
+            raise ValueError(
+                f"Invalid PiecewiseLinearFn configuration: {configuration!r}"
+            ) from err
+
+    @staticmethod
+    def one() -> "PiecewiseLinearFn":
+        """Return the constant function equal to 1."""
+        return PiecewiseLinearFn([(0, 1.0)])
+
+
+@register_task("online_backtranslation")
+class OnlineBackTranslationTask(TranslationTask):
+    @staticmethod
+    def add_args(parser):
+        """Register the data, denoising, back-translation and BLEU options on ``parser``."""
+        # fmt: off
+        # Generic translation args
+        parser.add_argument('data', help='colon separated path to data directories list, \
+                            will be iterated upon during epochs in round-robin manner; \
+                            however, valid and test data are always in the first directory to \
+                            avoid the need for repeating them in all directories')
+        parser.add_argument('--mono-langs', metavar='MONO_LANGS',
+                            help='monolingual languages for training')
+        parser.add_argument('--valid-lang-pairs', default=None, metavar='VALID_LANG_PAIRS',
+                            help='language pairs for validation')
+        parser.add_argument('--load-alignments', action='store_true',
+                            help='load the binarized alignments')
+        parser.add_argument('--left-pad-source', default='False', type=str, metavar='BOOL',
+                            help='pad the source on the left')
+        parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL',
+                            help='pad the target on the left')
+        parser.add_argument('--upsample-primary', default=1, type=int,
+                            help='amount to upsample primary dataset')
+        try:
+            parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
+                                help='max number of tokens in the source sequence')
+            parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
+                                help='max number of tokens in the target sequence')
+        except ArgumentError:
+            # Possibly defined already; fine to add here once this moves to hydra.
+            pass
+        parser.add_argument('--truncate-source', action='store_true', default=False,
+                            help='truncate source to max-source-positions')
+        parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N',
+                            help='if >0, then bucket source and target lengths into N '
+                                 'buckets and pad accordingly; this is useful on TPUs '
+                                 'to minimize the number of compilations')
+
+        # Denoising args
+        parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N',
+                            help='maximum word shuffle distance for denoising autoencoding data generation')
+        parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N',
+                            help='word dropout probability for denoising autoencoding data generation')
+        parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N',
+                            help='word blanking probability for denoising autoencoding data generation')
+
+        # Backtranslation args (weights are PiecewiseLinearFn configuration strings)
+        parser.add_argument('--lambda-bt', default="1.0", type=str, metavar='N',
+                            help='back-translation weight')
+        parser.add_argument('--lambda-dae', default="1.0", type=str, metavar='N',
+                            help='denoising auto-encoder weight')
+
+        # Evaluation args
+        parser.add_argument('--generate-one-by-one', action='store_true',
+                            help='generate one sentence at a time for backtranslation')
+        # BLEU scoring options used during validation
+        parser.add_argument('--eval-bleu', action='store_true',
+                            help='evaluation with BLEU scores')
+        parser.add_argument('--eval-bleu-detok', type=str, default="space",
+                            help='detokenize before computing BLEU (e.g., "moses"); '
+                                 'required if using --eval-bleu; use "space" to '
+                                 'disable detokenization; see fairseq.data.encoders '
+                                 'for other options')
+        parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON',
+                            help='args for building the tokenizer, if needed')
+        parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False,
+                            help='compute tokenized BLEU instead of sacrebleu')
+        parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None,
+                            help='remove BPE before computing BLEU')
+        parser.add_argument('--eval-bleu-args', type=str, metavar='JSON',
+                            help='generation args for BLEU scoring, '
+                                 'e.g., \'{"beam": 4, "lenpen": 0.6}\'')
+        parser.add_argument('--eval-bleu-print-samples', action='store_true',
+                            help='print sample generations during validation')
+        # fmt: on
+
+    def __init__(self, args, common_dict, mono_langs, valid_lang_pairs):
+        """Store dictionary/languages and resolve the data directories."""
+        super().__init__(args, common_dict, common_dict)
+        self.common_dict = common_dict
+        self.mono_langs = mono_langs
+        self.valid_lang_pairs = valid_lang_pairs
+        self.SHOW_SAMPLES_INTERVAL = 1000
+        # Start by showing samples
+        self._show_samples_ctr = self.SHOW_SAMPLES_INTERVAL
+        self.SHOW_SAMPLES_NUMBER = 5
+        self.lambda_bt = PiecewiseLinearFn.from_string(args.lambda_bt)
+        self.lambda_dae = PiecewiseLinearFn.from_string(args.lambda_dae)
+        self.args = args
+        self.data = utils.split_paths(self.args.data)
+        if len(self.data) == 1:
+            # Sort: glob order is arbitrary, but epochs index into this list.
+            shards = sorted(Path(self.data[0]).glob("shard*"))
+            if shards:
+                # keep this as strings, since it can also be a manifold path
+                old_data = self.data
+                self.data = [str(shard) for shard in shards]
+                logger.warning(f"Expanded data directory {old_data} to {self.data}")
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        args.left_pad_source = options.eval_bool(args.left_pad_source)
+        args.left_pad_target = options.eval_bool(args.left_pad_target)
+
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        assert args.mono_langs is not None
+
+        mono_langs = args.mono_langs.split(",")
+        # --valid-lang-pairs defaults to None: treat that as "no pairs".
+        pairs = args.valid_lang_pairs
+        valid_lang_pairs = pairs.split(",") if pairs else []
+
+        # load dictionary
+        common_dict = cls.load_dictionary(os.path.join(paths[0], "dict.txt"))
+        return cls(args, common_dict, mono_langs, valid_lang_pairs)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs) -> FairseqDataset:
+        """Load the dataset for ``split`` and store it in ``self.datasets``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): ``epoch - 1`` selects the training directory round-robin
+        """
+        if split != "train":
+            # valid/test always come from the first data directory.
+            loaded = self.load_translation_dataset(split, self.data[0])
+        else:
+            shard_idx = (epoch - 1) % len(self.data)
+            loaded = self.load_train_dataset(self.data[shard_idx])
+        self.datasets[split] = loaded
+        return loaded
+
+    def load_train_dataset(self, data_path: str) -> FairseqDataset:
+        """Zip a BT and a denoising dataset per language into one round-robin set."""
+        per_task = OrderedDict()
+        for lang in self.mono_langs:
+            train_path = os.path.join(data_path, lang, "train")
+            # TODO: could we do the BT using denoise sample ?
+            # this would halve the data loading work
+            per_task[f"{lang}-BT"] = self.load_bt_dataset(train_path, lang)
+            per_task[f"{lang}-DENOISE"] = self.load_denoise_dataset(
+                train_path, lang
+            )
+
+        return RoundRobinZipDatasets(per_task)
+
+    def _langpair_dataset(
+        self, src: FairseqDataset, tgt: FairseqDataset
+    ) -> LanguagePairDataset:
+        """Pair ``src`` with ``tgt``; both sides use the shared dictionary."""
+        shared = self.dictionary
+        pair_kwargs = dict(
+            tgt=tgt,
+            tgt_sizes=tgt.sizes,
+            tgt_dict=shared,
+            left_pad_source=self.args.left_pad_source,
+            left_pad_target=self.args.left_pad_target,
+        )
+        # TODO: should we shuffle ? we are already sorting batch by sizes so ?
+        # (``shuffle`` is not passed, so the dataset's own default applies)
+        return LanguagePairDataset(src, src.sizes, shared, **pair_kwargs)
+
+    def _prepend_lang_bos_to_target(
+        self, dataset: LanguagePairDataset, lang: str
+    ) -> LanguagePairDataset:
+        """Wrap ``dataset`` so the ``lang`` token is passed as the new target BOS."""
+        eos = self.dictionary.eos()
+        lang_bos = _lang_token_index(self.dictionary, lang)
+        # The source EOS is mapped to itself; for the target, EOS is given as the
+        # current BOS and the language token as its replacement.
+        return TransformEosLangPairDataset(
+            dataset, src_eos=eos, new_src_eos=eos, tgt_bos=eos, new_tgt_bos=lang_bos
+        )
+
+    def load_bt_dataset(self, data_path: str, lang: str) -> FairseqDataset:
+        """Build the back-translation dataset for ``lang`` from monolingual data.
+
+        Items are (tgt, tgt) pairs; the actual translation to a
+        (generated_src, tgt) pair is done on the fly during training.
+        """
+        monolingual = data_utils.load_indexed_dataset(
+            data_path, self.common_dict, self.args.dataset_impl
+        )
+        assert monolingual is not None, f"No dataset found for {lang}"
+
+        lang_idx = _lang_token_index(self.dictionary, lang)
+        # Source side: the same sentences, prefixed with the language token.
+        tagged_source = PrependTokenDataset(monolingual, lang_idx)
+        paired = self._langpair_dataset(tagged_source, monolingual)
+
+        logger.info(
+            f"mono_lang = {lang} "
+            f"lang token index = {lang_idx} "
+            f"lang token = {_lang_token(lang)}"
+        )
+
+        return self._prepend_lang_bos_to_target(paired, lang)
+
+    def load_denoise_dataset(self, data_path: str, lang: str) -> FairseqDataset:
+        """Build (noised source, clean target) pairs of ``lang`` for denoising."""
+        dataset = data_utils.load_indexed_dataset(
+            data_path, self.common_dict, self.args.dataset_impl
+        )
+        # Fail early with a clear message, as load_bt_dataset does.
+        assert dataset is not None, f"No dataset found for {lang}"
+        noisy_dataset = NoisingDataset(
+            dataset,
+            self.dictionary,
+            seed=1,
+            max_word_shuffle_distance=self.args.max_word_shuffle_distance,
+            word_dropout_prob=self.args.word_dropout_prob,
+            word_blanking_prob=self.args.word_blanking_prob,
+        )
+        noisy_dataset = PrependTokenDataset(
+            noisy_dataset, _lang_token_index(self.dictionary, lang)
+        )
+        clean_dataset = data_utils.load_indexed_dataset(
+            data_path, self.common_dict, self.args.dataset_impl
+        )
+        denoising_dataset = self._langpair_dataset(noisy_dataset, clean_dataset)
+        return self._prepend_lang_bos_to_target(denoising_dataset, lang)
+
+    def load_translation_dataset(
+        self, split: str, data_path: str, combine: bool = False
+    ):
+        """Load the parallel ``split`` for the single configured validation pair."""
+        # only judging with one language pair for the moment,
+        # since ConcatDataset doesn't work as expected
+        assert len(self.valid_lang_pairs) == 1, "For now..."
+        lang_pair = self.valid_lang_pairs[0]
+        src, tgt = lang_pair.split("-")
+        args, shared_dict = self.args, self.common_dict
+        # use the same function as TranslationTask
+        pair_dataset = load_langpair_dataset(
+            data_path,
+            split,
+            src,
+            shared_dict,
+            tgt,
+            shared_dict,
+            combine=combine,
+            dataset_impl=args.dataset_impl,
+            upsample_primary=args.upsample_primary,
+            left_pad_source=args.left_pad_source,
+            left_pad_target=args.left_pad_target,
+            max_source_positions=args.max_source_positions,
+            max_target_positions=args.max_target_positions,
+            load_alignments=args.load_alignments,
+            truncate_source=args.truncate_source,
+            num_buckets=args.num_batch_buckets,
+            shuffle=(split != "test"),
+            prepend_bos_src=_lang_token_index(self.dictionary, src),
+        )
+        with_lang_bos = self._prepend_lang_bos_to_target(pair_dataset, tgt)
+        with_lang_bos.args = args
+        return with_lang_bos
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        raise NotImplementedError()  # not implemented for this task
+
+    def build_model(self, args, from_checkpoint=False):
+        """Build the model, per-language BT generators and optional BLEU tools."""
+        # torch.autograd.set_detect_anomaly(True)
+        model = super().build_model(args, from_checkpoint)
+        add_secial_tokens_to_dict_and_model(self.common_dict, model, self.mono_langs)
+
+        def new_bt_generator():
+            return SequenceGenerator(
+                [model],
+                tgt_dict=self.dictionary,
+                beam_size=1,
+                max_len_a=1.3,
+                max_len_b=5,
+                min_len=5,
+                # keep 1 to be able to prepend bos
+                max_len=model.max_decoder_positions() - 1,
+            )
+
+        self.sequence_generators = {lang: new_bt_generator() for lang in self.mono_langs}
+        if not getattr(args, "eval_bleu", False):
+            return model
+
+        detok = getattr(args, "eval_bleu_detok", None)
+        assert detok is not None, (
+            "--eval-bleu-detok is required if using --eval-bleu; "
+            "try --eval-bleu-detok=moses (or --eval-bleu-detok=space "
+            "to disable detokenization, e.g., when using sentencepiece)"
+        )
+        # Both JSON options may be missing or None; fall back to an empty object.
+        detok_args = json.loads(getattr(args, "eval_bleu_detok_args", "{}") or "{}")
+        self.tokenizer = encoders.build_tokenizer(
+            Namespace(tokenizer=detok, **detok_args)
+        )
+        gen_args = json.loads(getattr(args, "eval_bleu_args", "{}") or "{}")
+        self.bleu_sequence_generator = self.build_generator(
+            [model], Namespace(**gen_args)
+        )
+        return model
+
+    def max_positions(self):
+        """Return the (max source, max target) position limits taken from ``args``."""
+        return (self.args.max_source_positions, self.args.max_target_positions)
+
+    @property
+    def dictionary(self):
+        """Return the shared :class:`~fairseq.data.Dictionary` (``common_dict``)."""
+        return self.common_dict
+
+    def display_samples_once_in_a_while(self, smp, mono_lang, other_lang):
+        """Log a few samples of ``smp`` once every ``SHOW_SAMPLES_INTERVAL`` calls."""
+        self._show_samples_ctr += 1
+        if self._show_samples_ctr < self.SHOW_SAMPLES_INTERVAL:
+            return
+        self._show_samples_ctr = 0
+
+        all_src = smp["net_input"]["src_tokens"]
+        ln = all_src.shape[0]
+        logger.info(
+            f"(r:{self.args.distributed_rank}) : "
+            f"{other_lang} ---> {mono_lang} "
+            f"({other_lang} was generated by back-translation.) {ln} samples"
+        )
+        for i in range(min(ln, self.SHOW_SAMPLES_NUMBER)):
+            src_tokens, tgt_tokens = all_src[i], smp["target"][i]
+            src_str, tgt_str = (
+                self.dictionary.string(toks, "sentencepiece")
+                for toks in (src_tokens, tgt_tokens)
+            )
+            logger.info(
+                f"\n{i}\t\t[{other_lang} generated]  {src_str}\n"
+                f"\t\t[{mono_lang} original ]  {tgt_str}\n"
+                f"\t\t[ src tokens]  {src_tokens}\n"
+            )
+
+    def backtranslate_sample(self, smp, orig_lang, other_lang) -> None:
+        """
+        * WARNING: smp is modified in place.
+        * At the start of this function, `smp` has the same input and target:
+          |--------------------------------------------------------|
+          | smp['net_input']['src_tokens'] |  smp['target']        |
+          | (from data) __en__ hello world |  __en__ hello world   |
+          |--------------------------------------------------------|
+
+        * We call generator.generate(smp, bos_token = token("ro")),
+        and copy the result as input
+        * At the end, `smp` has the translation to other language.
+          |--------------------------------------------------------|
+          | smp['net_input']['src_tokens'] |  smp['target']        |
+          | (generated) __ro__ salut lume  |  __en__ hello world   |
+          |--------------------------------------------------------|
+
+        """
+        bos_token = _lang_token_index(self.dictionary, other_lang)
+        generator = self.sequence_generators[orig_lang]
+        generated = generator.generate(models=[], sample=smp, bos_token=bos_token)
+
+        net_input = smp["net_input"]
+        batch_size = len(generated)
+        max_lngth = max(hypos[0]["tokens"].size(0) for hypos in generated)
+        # One extra column for the language token prepended to every row.
+        n_src_tokens = torch.empty(
+            size=(batch_size, max_lngth + 1), dtype=net_input["src_tokens"].dtype
+        )
+        n_src_lengths = torch.empty(batch_size, dtype=net_input["src_lengths"].dtype)
+
+        pad_idx = self.dictionary.pad()
+        for row, hypos in enumerate(generated):
+            best = hypos[0]["tokens"]
+            best_len = best.size(0)
+            with_bos = torch.cat([best.new([bos_token]), best])
+            # Right-pad so that every row has max_lngth + 1 entries.
+            n_src_tokens[row] = F.pad(with_bos, (0, max_lngth - best_len), value=pad_idx)
+            n_src_lengths[row] = best_len + 1
+
+        # Read the device before the old tensors are dropped from net_input.
+        device = net_input["src_tokens"].device
+        # This seems to be important
+        del net_input["src_tokens"]
+        del net_input["src_lengths"]
+        net_input["src_tokens"] = n_src_tokens.to(device)
+        net_input["src_lengths"] = n_src_lengths.to(device)
+
+    def generate(self, smp, model):
+        """Decode ``smp`` with ``model`` in eval mode and without gradients."""
+        model.eval()
+        net_input = smp["net_input"]
+        # The language token opening the first source sentence picks the generator.
+        lang_symbol = self.dictionary[net_input["src_tokens"][0][0]]
+        orig_lang = lang_symbol.replace(" ", "").replace("_", "")
+        # The first previous-output token is forced as BOS.
+        bos_token = net_input["prev_output_tokens"][0][0]
+        with torch.no_grad():
+            return self.sequence_generators[orig_lang].generate(
+                models=[model], sample=smp, bos_token=bos_token
+            )
+
+    def get_other_lang(self, lang):
+        """Pick a translation partner for ``lang`` among ``self.mono_langs``."""
+        # TODO: allow more complex mapping
+        first, n_langs = self.mono_langs[0], len(self.mono_langs)
+        if lang != first:
+            return first
+        return self.mono_langs[1 if n_langs == 2 else np.random.randint(1, n_langs)]
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        """Forward/backward over each "<lang>-<BT|DENOISE>" batch in ``sample``."""
+        model.train()
+        model.set_num_updates(update_num)
+
+        agg_loss, agg_sample_size = 0.0, 0.0
+        agg_logging_output: Dict[str, float] = defaultdict(float)
+
+        dataset_keys = self.datasets["train"].datasets.keys()
+
+        weights = {  # current loss weight of each sub-task at this update
+            "BT": self.lambda_bt(update_num),
+            "DENOISE": self.lambda_dae(update_num),
+        }
+        log_keys = {"BT": "bt_", "DENOISE": "dae_"}  # logging prefix per sub-task
+
+        for dataset_key in dataset_keys:
+            smp = sample[dataset_key]
+            mono_lang, task_subtype = dataset_key.split("-")
+            if weights[task_subtype] == 0:
+                continue  # sub-task currently weighted 0: no forward/backward
+
+            if task_subtype == "BT":
+                with torch.autograd.profiler.record_function("backtranslation"):
+                    model.eval()  # generation runs in eval mode
+                    # TODO: Could we translate to several language at once ?
+                    # this would allow to share encoder_out and maximize GPU usage.
+                    other_lang = self.get_other_lang(mono_lang)
+                    self.backtranslate_sample(smp, mono_lang, other_lang)  # in place
+                    self.display_samples_once_in_a_while(smp, mono_lang, other_lang)
+                    model.train()  # back to train mode for the loss below
+
+            # Like in FairseqTask.train_step
+            with torch.autograd.profiler.record_function("forward"):
+                loss, sample_size, logging_output = criterion(model, smp)
+            loss *= weights[task_subtype]
+            if ignore_grad:
+                loss *= 0
+            with torch.autograd.profiler.record_function("backward"):
+                optimizer.backward(loss)
+
+            agg_loss += loss.item()
+            agg_sample_size += sample_size
+            for k in logging_output:  # each stat is kept prefixed and in total
+                agg_logging_output[log_keys[task_subtype] + k] += logging_output[k]
+                agg_logging_output[k] += logging_output[k]
+
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def get_bos_token_from_sample(self, sample):
+        """Return the index of the language token paired with the source language.
+
+        All source sentences must open with the same language token, otherwise
+        ``.item()`` fails on the multi-element result of ``torch.unique``.
+        """
+        first_column = sample["net_input"]["src_tokens"][:, 0]
+        src_lang = self.dictionary[torch.unique(first_column).item()].replace("_", "")
+        return _lang_token_index(self.dictionary, self.get_other_lang(src_lang))
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Aggregate logging outputs and add per-sub-task metrics.
+
+        For each of the ``bt_`` and ``dae_`` prefixes with a non-zero sample
+        size, logs ``loss`` and ``nll_loss`` (both divided by ln 2) and ``ppl``.
+        """
+        super().reduce_metrics(logging_outputs, criterion)
+
+        def total(key):
+            return sum(log.get(key, 0) for log in logging_outputs)
+
+        for prefix in ("bt_", "dae_"):
+            sample_size = total(prefix + "sample_size")
+            if not sample_size:
+                continue
+            loss = total(prefix + "loss")
+            loss *= 1 / sample_size / math.log(2)
+            metrics.log_scalar(prefix + "loss", loss, sample_size, round=3)
+
+            nll_loss = total(prefix + "nll_loss")
+            ntokens = total(prefix + "ntokens")
+            nll_loss *= 1 / ntokens / math.log(2)
+            metrics.log_scalar(prefix + "nll_loss", nll_loss, ntokens, round=3)
+            # ``key`` is bound as a default so each meter reads its own prefix.
+            metrics.log_derived(
+                prefix + "ppl",
+                lambda meters, key=prefix + "nll_loss": utils.get_perplexity(
+                    meters[key].avg
+                ),
+            )
+
+
+@torch.no_grad()
+def extend_embedding(
+    emb: nn.Module, new_vocab_size: int, copy_from_token_id: int
+) -> None:
+    """Grow ``emb`` in place so that it covers ``new_vocab_size`` entries.
+
+    New weight rows copy row ``copy_from_token_id``; new bias entries are zero.
+    """
+    old_emb_data = emb.weight.data
+    (old_vocab_size, dim) = old_emb_data.shape
+    assert new_vocab_size >= old_vocab_size
+
+    if new_vocab_size > old_vocab_size:
+        # new_zeros() keeps the dtype and device of the old weights; a bare
+        # torch.zeros() silently replaced them by a default-dtype CPU tensor.
+        new_weight = old_emb_data.new_zeros((new_vocab_size, dim))
+        new_weight[:old_vocab_size] = old_emb_data
+        new_weight[old_vocab_size:] = old_emb_data[copy_from_token_id]
+        emb.weight.data = new_weight
+        for size_attr in ("num_embeddings", "out_features"):
+            if hasattr(emb, size_attr):
+                setattr(emb, size_attr, new_vocab_size)
+
+    if getattr(emb, "bias", None) is None:
+        return
+    # The bias can be shorter than the weight when the weight matrix was
+    # shared and already extended but the bias was not.
+    (old_bias_size,) = emb.bias.shape
+    assert new_vocab_size >= old_bias_size
+    if new_vocab_size > old_bias_size:
+        new_bias = emb.bias.data.new_zeros((new_vocab_size,))
+        new_bias[:old_bias_size] = emb.bias.data
+        emb.bias.data = new_bias
+
+
+def add_secial_tokens_to_dict_and_model(
+    dictionary: "fairseq.data.Dictionary",
+    model: nn.Module,
+    mono_langs: Sequence[str],
+) -> None:
+    """Add ``<mask>`` and language tokens; extend the embeddings if needed."""
+    embs = model.encoder.embed_tokens
+    vocab_size, embedding_dim = embs.weight.shape
+
+    # The model may or may not have a '<mask>' embedding yet
+    assert (
+        len(dictionary) <= vocab_size <= len(dictionary) + 1
+    ), f"Dictionary len ({len(dictionary)}) doesn't match embs shape ({embs.weight.shape})"
+    # TODO: we should reuse the pretrained model dict which already has <mask>
+    for symbol in ["<mask>", *map(_lang_token, mono_langs)]:
+        dictionary.add_symbol(symbol)
+
+    # NOTE(review): sizes are printed as new dict -> old embeddings; confirm order.
+    logger.info(
+        f"dictionary: {len(dictionary)} -> {vocab_size} tokens "
+        f"after adding {len(mono_langs)} lang tokens."
+    )
+
+    new_size = len(dictionary)
+    if new_size <= vocab_size:
+        return
+    bos = dictionary.bos()
+    extend_embedding(embs, new_size, bos)
+    extend_embedding(model.decoder.embed_tokens, new_size, bos)
+    lm_head = model.decoder.output_projection
+    extend_embedding(lm_head, new_size, bos)
+    assert lm_head.weight.shape == (new_size, embedding_dim)
+
+
+def _lang_token(lang: str) -> str:
+    return "__{}__".format(lang)  # language tag symbol, e.g. "en" -> "__en__"
+
+
+def _lang_token_index(dictionary, lang: str) -> int:
+    return dictionary.index(_lang_token(lang))  # index of the "__<lang>__" symbol
+
+
+@contextlib.contextmanager
+def assert_weights_have_changed(model: nn.Module):
+    """Context manager asserting that the sum of all parameters changes."""
+
+    def checksum() -> float:
+        return sum(param.sum().item() for param in model.parameters())
+
+    before = checksum()
+    yield model
+    after = checksum()
+    logger.info(f"initial_checksum={before} -> final_checksum={after}")
+    assert before != after, "Model hasn't changed !"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
new file mode 100644
index 0000000000..432b8a52ca
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
@@ -0,0 +1,485 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from collections import OrderedDict
+
+from fairseq import utils
+from fairseq.data import (
+    BacktranslationDataset,
+    IndexedCachedDataset,
+    IndexedDataset,
+    IndexedRawTextDataset,
+    LanguagePairDataset,
+    NoisingDataset,
+    RoundRobinZipDatasets,
+    data_utils,
+    indexed_dataset,
+)
+from fairseq.models import FairseqMultiModel
+from fairseq.sequence_generator import SequenceGenerator
+
+from . import register_task
+from .multilingual_translation import MultilingualTranslationTask
+
+
+logger = logging.getLogger(__name__)
+
+
+def _get_bt_dataset_key(lang_pair):
+    return "bt:" + lang_pair  # key of the back-translation dataset for this pair
+
+
+def _get_denoising_dataset_key(lang_pair):
+    return "denoising:" + lang_pair  # key of the denoising dataset for this pair
+
+
+# ported from UnsupervisedMT
+def parse_lambda_config(x):
+    """Parse the configuration of a lambda coefficient (for scheduling).
+
+    x = "3"                  # lambda is the constant 3; returns (3.0, None)
+    x = "0:1,1000:0"         # "step:value" pairs: lambda starts at 1 and
+                             # linearly decreases to 0 over 1000 iterations
+    x = "0:0,1000:0,2000:1"  # lambda is 0 for the first 1000 iterations, then
+                             # linearly increases to 1 until iteration 2000
+
+    Returns (initial value, schedule); the schedule is None for a constant,
+    else a list of (step, value) pairs with strictly increasing steps.
+    """
+    if "," not in x:
+        return float(x), None
+    # Always split on ":"; os.pathsep is ";" on Windows and broke parsing there.
+    split = [s.split(":") for s in x.split(",")]
+    assert all(len(s) == 2 for s in split)
+    assert all(k.isdigit() for k, _ in split)
+    assert all(int(a[0]) < int(b[0]) for a, b in zip(split, split[1:]))
+    return float(split[0][1]), [(int(k), float(v)) for k, v in split]
+
+
+@register_task("semisupervised_translation")
+class SemisupervisedTranslationTask(MultilingualTranslationTask):
+    """A task for training multiple translation models simultaneously.
+
+    We iterate round-robin over batches from multiple language pairs, ordered
+    according to the `--lang-pairs` argument.
+
+    The training loop is roughly:
+
+        for i in range(len(epoch)):
+            for lang_pair in args.lang_pairs:
+                batch = next_batch_for_lang_pair(lang_pair)
+                loss = criterion(model_for_lang_pair(lang_pair), batch)
+                loss.backward()
+            optimizer.step()
+
+    In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset
+    (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that
+    implements the `FairseqMultiModel` interface.
+
+    During inference it is required to specify a single `--source-lang` and
+    `--target-lang`, instead of `--lang-pairs`.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments; lambda schedules use "step:weight" pairs."""
+        # fmt: off
+        MultilingualTranslationTask.add_args(parser)
+        parser.add_argument('--lambda-parallel-config', default="1.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (parallel data). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--lambda-denoising-config', default="0.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (denoising autoencoding). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--lambda-otf-bt-config', default="0.0", type=str, metavar='CONFIG',
+                            help='cross-entropy reconstruction coefficient (on-the-fly back-translation parallel data). '
+                                 'use fixed weight during training if set to floating point number. '
+                                 'use piecewise linear function over number of updates to schedule the '
+                                 'weight with the format: step0:w0,step1:w1,...')
+        parser.add_argument('--bt-max-len-a', default=1.1, type=float, metavar='N',
+                            help='generate back-translated sequences of maximum length ax + b, where x is the '
+                                 'source length')
+        parser.add_argument('--bt-max-len-b', default=10.0, type=float, metavar='N',
+                            help='generate back-translated sequences of maximum length ax + b, where x is the '
+                                 'source length')
+        parser.add_argument('--bt-beam-size', default=1, type=int, metavar='N',
+                            help='beam size used in beam search of online back-translation')
+        parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N',
+                            help='maximum word shuffle distance for denoising autoencoding data generation')
+        parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N',
+                            help='word dropout probability for denoising autoencoding data generation')
+        parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N',
+                            help='word blanking probability for denoising autoencoding data generation')
+        # fmt: on
+
+    def __init__(self, args, dicts, training):
+        """Parse the three lambda configs and register the denoising pairs."""
+        super().__init__(args, dicts, training)
+        self.lambda_parallel, self.lambda_parallel_steps = parse_lambda_config(
+            args.lambda_parallel_config
+        )
+        self.lambda_otf_bt, self.lambda_otf_bt_steps = parse_lambda_config(
+            args.lambda_otf_bt_config
+        )
+        self.lambda_denoising, self.lambda_denoising_steps = parse_lambda_config(
+            args.lambda_denoising_config
+        )
+        if self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None:
+            # One "tgt-tgt" pair per target language; sorted() because the
+            # iteration order of a set of strings can differ between processes.
+            tgts = sorted({pair.split("-")[1] for pair in args.lang_pairs})
+            self.model_lang_pairs = self.model_lang_pairs + [t + "-" + t for t in tgts]
+        self.backtranslate_datasets = {}
+        self.backtranslators = {}
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task from the dictionaries prepared by the parent task."""
+        return cls(args, *MultilingualTranslationTask.prepare(args, **kwargs))
+
+    def load_dataset(self, split, epoch=1, **kwargs):
+        """Load ``split`` as a round-robin zip of parallel, BT and denoising data."""
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]  # cycle through paths by epoch
+
+        def split_exists(split, src, tgt, lang):
+            if src is not None:
+                filename = os.path.join(
+                    data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)
+                )
+            else:  # NOTE(review): no call in this method passes src=None
+                filename = os.path.join(
+                    data_path, "{}.{}-None.{}".format(split, src, tgt)
+                )
+            return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl)
+
+        def load_indexed_dataset(path, dictionary):
+            return data_utils.load_indexed_dataset(
+                path, dictionary, self.args.dataset_impl
+            )
+
+        # parallel data: always for non-train splits, for train only if weighted
+        src_datasets, tgt_datasets = {}, {}
+        if (
+            self.lambda_parallel > 0.0
+            or self.lambda_parallel_steps is not None
+            or not split.startswith("train")
+        ):
+            for lang_pair in self.lang_pairs:
+                src, tgt = lang_pair.split("-")
+                if split_exists(split, src, tgt, src):
+                    prefix = os.path.join(
+                        data_path, "{}.{}-{}.".format(split, src, tgt)
+                    )
+                elif split_exists(split, tgt, src, src):
+                    prefix = os.path.join(
+                        data_path, "{}.{}-{}.".format(split, tgt, src)
+                    )
+                else:
+                    continue  # neither "src-tgt" nor "tgt-src" files exist
+                src_datasets[lang_pair] = load_indexed_dataset(
+                    prefix + src, self.dicts[src]
+                )
+                tgt_datasets[lang_pair] = load_indexed_dataset(
+                    prefix + tgt, self.dicts[tgt]
+                )
+                logger.info(
+                    "parallel-{} {} {} examples".format(
+                        data_path, split, len(src_datasets[lang_pair])
+                    )
+                )
+            if len(src_datasets) == 0:
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, data_path)
+                )
+
+        # back translation: built from "{split}.{tgt}-None.{tgt}", train splits only
+        backtranslate_datasets = {}
+        if (
+            self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None
+        ) and split.startswith("train"):
+            for lang_pair in self.lang_pairs:
+                src, tgt = lang_pair.split("-")
+                if not split_exists(split, tgt, None, tgt):
+                    raise FileNotFoundError(
+                        "Dataset not found: backtranslation {} ({})".format(
+                            split, data_path
+                        )
+                    )
+                filename = os.path.join(
+                    data_path, "{}.{}-None.{}".format(split, tgt, tgt)
+                )
+                dataset = load_indexed_dataset(filename, self.dicts[tgt])
+                lang_pair_dataset_tgt = LanguagePairDataset(
+                    dataset,
+                    dataset.sizes,
+                    self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                )
+                lang_pair_dataset = LanguagePairDataset(
+                    dataset,
+                    dataset.sizes,
+                    src_dict=self.dicts[src],
+                    tgt=dataset,
+                    tgt_sizes=dataset.sizes,
+                    tgt_dict=self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                )
+                backtranslate_datasets[lang_pair] = BacktranslationDataset(
+                    tgt_dataset=self.alter_dataset_langtok(
+                        lang_pair_dataset_tgt,
+                        src_eos=self.dicts[tgt].eos(),
+                        src_lang=tgt,
+                        tgt_lang=src,
+                    ),
+                    backtranslation_fn=self.backtranslators[lang_pair],
+                    src_dict=self.dicts[src],
+                    tgt_dict=self.dicts[tgt],
+                    output_collater=self.alter_dataset_langtok(
+                        lang_pair_dataset=lang_pair_dataset,
+                        src_eos=self.dicts[src].eos(),
+                        src_lang=src,
+                        tgt_eos=self.dicts[tgt].eos(),
+                        tgt_lang=tgt,
+                    ).collater,
+                )
+                logger.info(
+                    "backtranslate-{}: {} {} {} examples".format(
+                        tgt,
+                        data_path,
+                        split,
+                        len(backtranslate_datasets[lang_pair]),
+                    )
+                )
+                self.backtranslate_datasets[lang_pair] = backtranslate_datasets[
+                    lang_pair
+                ]
+
+        # denoising autoencoder: noised copy as source, same file as target
+        noising_datasets = {}
+        if (
+            self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None
+        ) and split.startswith("train"):
+            for lang_pair in self.lang_pairs:
+                _, tgt = lang_pair.split("-")
+                if not split_exists(split, tgt, None, tgt):
+                    continue  # unlike BT above, a missing file is skipped silently
+                filename = os.path.join(
+                    data_path, "{}.{}-None.{}".format(split, tgt, tgt)
+                )
+                tgt_dataset1 = load_indexed_dataset(filename, self.dicts[tgt])
+                tgt_dataset2 = load_indexed_dataset(filename, self.dicts[tgt])
+                noising_dataset = NoisingDataset(
+                    tgt_dataset1,
+                    self.dicts[tgt],
+                    seed=1,
+                    max_word_shuffle_distance=self.args.max_word_shuffle_distance,
+                    word_dropout_prob=self.args.word_dropout_prob,
+                    word_blanking_prob=self.args.word_blanking_prob,
+                )
+                noising_datasets[lang_pair] = self.alter_dataset_langtok(
+                    LanguagePairDataset(
+                        noising_dataset,
+                        tgt_dataset1.sizes,
+                        self.dicts[tgt],
+                        tgt_dataset2,
+                        tgt_dataset2.sizes,
+                        self.dicts[tgt],
+                        left_pad_source=self.args.left_pad_source,
+                        left_pad_target=self.args.left_pad_target,
+                    ),
+                    src_eos=self.dicts[tgt].eos(),
+                    src_lang=tgt,
+                    tgt_eos=self.dicts[tgt].eos(),
+                    tgt_lang=tgt,
+                )
+                logger.info(
+                    "denoising-{}: {} {} {} examples".format(
+                        tgt,
+                        data_path,
+                        split,
+                        len(noising_datasets[lang_pair]),
+                    )
+                )
+
+        def language_pair_dataset(lang_pair):
+            src, tgt = lang_pair.split("-")
+            src_dataset, tgt_dataset = src_datasets[lang_pair], tgt_datasets[lang_pair]
+            return self.alter_dataset_langtok(
+                LanguagePairDataset(
+                    src_dataset,
+                    src_dataset.sizes,
+                    self.dicts[src],
+                    tgt_dataset,
+                    tgt_dataset.sizes,
+                    self.dicts[tgt],
+                    left_pad_source=self.args.left_pad_source,
+                    left_pad_target=self.args.left_pad_target,
+                ),
+                self.dicts[src].eos(),
+                src,
+                self.dicts[tgt].eos(),
+                tgt,
+            )
+
+        self.datasets[split] = RoundRobinZipDatasets(
+            OrderedDict(
+                [
+                    (lang_pair, language_pair_dataset(lang_pair))
+                    for lang_pair in src_datasets.keys()
+                ]
+                + [
+                    (_get_bt_dataset_key(lang_pair), dataset)
+                    for lang_pair, dataset in backtranslate_datasets.items()
+                ]
+                + [
+                    (_get_denoising_dataset_key(lang_pair), dataset)
+                    for lang_pair, dataset in noising_datasets.items()
+                ]
+            ),
+            eval_key=None  # None while training, else the source-target pair
+            if self.training
+            else "%s-%s" % (self.args.source_lang, self.args.target_lang),
+        )
+
+    def build_model(self, args, from_checkpoint=False):
+        """Build the model; for online BT also one reverse-pair generator per pair."""
+        from fairseq import models
+
+        model = models.build_model(args, self, from_checkpoint)
+        if not isinstance(model, FairseqMultiModel):
+            raise ValueError(
+                "SemisupervisedTranslationTask requires a FairseqMultiModel architecture"
+            )
+
+        self.sequence_generators = {}
+        uses_otf_bt = self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None
+        if not (uses_otf_bt and self.training):
+            return model
+
+        for lang_pair in self.lang_pairs:
+            src, tgt = lang_pair.split("-")
+            reverse_pair = "{}-{}".format(tgt, src)
+            reverse_model = model.models[reverse_pair]
+            generator = SequenceGenerator(
+                [reverse_model],
+                tgt_dict=self.dicts[src],
+                beam_size=args.bt_beam_size,
+                max_len_a=args.bt_max_len_a,
+                max_len_b=args.bt_max_len_b,
+            )
+            self.sequence_generators[reverse_pair] = generator
+
+            # The defaults bind this iteration's objects; free variables would
+            # all resolve to the last iteration's values when called later.
+            def backtranslate_fn(
+                sample,
+                model=reverse_model,
+                bos_token=self.get_decoder_langtok(src),
+                sequence_generator=generator,
+            ):
+                return sequence_generator.generate([model], sample, bos_token=bos_token)
+
+            self.backtranslators[lang_pair] = backtranslate_fn
+
+        return model
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        """Accumulate gradients over the parallel, BT and denoising batches.
+
+        An objective is skipped when its lambda is not positive. Each loss is
+        multiplied by that lambda, or by 0 when ``ignore_grad`` is set.
+
+        Returns the summed scaled loss, the summed sample size and the merged
+        logging dict (each stat under ``k`` and under ``"<dataset key>:k"``).
+        """
+        model.train()
+
+        if update_num > 0:
+            self.update_step(update_num)
+
+        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
+
+        def forward_backward(model, samples, logging_output_key, weight):
+            nonlocal agg_loss, agg_sample_size
+            if samples is None or len(samples) == 0:
+                return
+            loss, sample_size, logging_output = criterion(model, samples)
+            loss *= 0 if ignore_grad else weight
+            optimizer.backward(loss)
+            agg_loss += loss.detach().item()
+            # TODO make summing of the sample sizes configurable
+            agg_sample_size += sample_size
+            # The aggregate is a plain dict: ``+=`` on a missing key raised
+            # KeyError on the first batch, so missing entries start at 0.
+            for k, value in logging_output.items():
+                for key in (k, "{}:{}".format(logging_output_key, k)):
+                    agg_logging_output[key] = agg_logging_output.get(key, 0) + value
+
+        if self.lambda_parallel > 0.0:
+            for lang_pair in self.lang_pairs:
+                forward_backward(
+                    model.models[lang_pair],
+                    sample[lang_pair],
+                    lang_pair,
+                    self.lambda_parallel,
+                )
+
+        if self.lambda_otf_bt > 0.0:
+            for lang_pair in self.lang_pairs:
+                key = _get_bt_dataset_key(lang_pair)
+                forward_backward(
+                    model.models[lang_pair], sample[key], key, self.lambda_otf_bt
+                )
+
+        if self.lambda_denoising > 0.0:
+            for lang_pair in self.lang_pairs:
+                _, tgt = lang_pair.split("-")
+                key = _get_denoising_dataset_key(lang_pair)
+                dae_model = model.models["{0}-{0}".format(tgt)]
+                forward_backward(dae_model, sample[key], key, self.lambda_denoising)
+
+        return agg_loss, agg_sample_size, agg_logging_output
+
+    def update_step(self, num_updates):
+        """Refresh every scheduled lambda for update number ``num_updates``.
+
+        A lambda whose ``*_steps`` schedule is ``None`` keeps its current value.
+        Called by ``train_step`` whenever ``update_num > 0``.
+        """
+
+        def lambda_step_func(config, n_iter):
+            """
+            Interpolate linearly between the ``(step, value)`` breakpoints of
+            ``config``; from the last breakpoint on, its value is returned.
+            """
+            segments = [
+                (lo, hi)
+                for lo, hi in zip(config, config[1:])
+                if lo[0] <= n_iter < hi[0]
+            ]
+            if not segments:
+                assert n_iter >= config[-1][0]
+                return config[-1][1]
+            assert len(segments) == 1
+            (x_a, y_a), (x_b, y_b) = segments[0]
+            return y_a + (n_iter - x_a) * float(y_b - y_a) / float(x_b - x_a)
+
+        # Each lambda attribute has a matching "<name>_steps" schedule.
+        for name in ("lambda_parallel", "lambda_denoising", "lambda_otf_bt"):
+            schedule = getattr(self, name + "_steps")
+            if schedule is not None:
+                setattr(self, name, lambda_step_func(schedule, num_updates))
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
new file mode 100644
index 0000000000..de80addaf2
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
@@ -0,0 +1,303 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import contextlib
+from dataclasses import dataclass, field
+from typing import Optional
+from omegaconf import MISSING, II, open_dict, OmegaConf
+
+import numpy as np
+from fairseq.data import (
+    ConcatSentencesDataset,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    OffsetTokensDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    RightPadDataset,
+    RightPaddingMaskDataset,
+    RollDataset,
+    SortDataset,
+    StripTokenDataset,
+    data_utils,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import FairseqDataclass, FairseqTask, register_task
+from fairseq.dataclass import ChoiceEnum
+
+
+logger = logging.getLogger(__name__)
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+
+
+@dataclass
+class SentencePredictionConfig(FairseqDataclass):
+    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
+    num_classes: int = field(
+        default=-1,
+        metadata={"help": "number of classes or regression targets"},
+    )
+    init_token: Optional[int] = field(
+        default=None,
+        metadata={"help": "add token at the beginning of each batch item"},
+    )
+    separator_token: Optional[int] = field(
+        default=None,
+        metadata={"help": "add separator token between inputs"},
+    )
+    no_shuffle: bool = field(
+        default=False,
+    )
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed tokens_per_sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    add_prev_output_tokens: bool = field(
+        default=False,
+        metadata={
+            "help": "add prev_output_tokens to sample, used for encoder-decoder arch"
+        },
+    )
+    max_positions: int = field(
+        default=512,
+        metadata={"help": "max tokens per example"},
+    )
+
+    regression_target: bool = II("criterion.regression_target")
+    classification_head_name: str = II("criterion.classification_head_name")
+    seed: int = II("common.seed")
+
+    d2v2_multi: bool = field(
+        default=False,
+        metadata={"help": "prepare dataset for data2vec_multi"},
+    )
+
+
+@register_task("sentence_prediction", dataclass=SentencePredictionConfig)
+class SentencePredictionTask(FairseqTask):
+    """
+    Sentence (or sentence pair) prediction (classification or regression) task.
+
+    Args:
+        cfg, data_dictionary, label_dictionary: task config, input and label dicts
+    """
+
+    def __init__(self, cfg, data_dictionary, label_dictionary):
+        super().__init__(cfg)
+        self.dictionary = data_dictionary
+        self._label_dictionary = label_dictionary
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load a dictionary from ``filename`` and add the ``<mask>`` symbol to it.
+
+        Args:
+            filename (str): path of the dictionary file
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol("<mask>")
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, cfg, **kwargs):
+        assert cfg.num_classes > 0, "Must set task.num_classes"
+
+        # load data dictionary (always <data>/input0/dict.txt)
+        data_dict = cls.load_dictionary(
+            os.path.join(cfg.data, "input0", "dict.txt"),
+        )
+        logger.info("[input] dictionary: {} types".format(len(data_dict)))
+
+        # load label dictionary (regression reuses the data dictionary instead)
+        if not cfg.regression_target:
+            label_dict = cls.load_dictionary(
+                os.path.join(cfg.data, "label", "dict.txt"),
+            )
+            logger.info("[label] dictionary: {} types".format(len(label_dict)))
+        else:
+            label_dict = data_dict
+        return cls(cfg, data_dict, label_dict)
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split (e.g., train, valid, test) into ``self.datasets``."""
+
+        def get_path(key, split):
+            return os.path.join(self.cfg.data, key, split)
+
+        def make_dataset(key, dictionary):
+            split_path = get_path(key, split)
+            # Only a "[404] Path not found" storage error maps to None; others re-raise.
+            try:
+                dataset = data_utils.load_indexed_dataset(
+                    split_path,
+                    dictionary,
+                    combine=combine,
+                )
+            except Exception as e:
+                if "StorageException: [404] Path not found" in str(e):
+                    logger.warning(f"dataset {e} not found")
+                    dataset = None
+                else:
+                    raise e
+            return dataset
+
+        input0 = make_dataset("input0", self.source_dictionary)
+        assert input0 is not None, "could not find dataset: {}".format(
+            get_path("input0", split)
+        )
+        input1 = make_dataset("input1", self.source_dictionary)
+        # input1 may be None, in which case only input0 is used below.
+        if self.cfg.init_token is not None:
+            input0 = PrependTokenDataset(input0, self.cfg.init_token)
+
+        if input1 is None:
+            src_tokens = input0
+        else:
+            if self.cfg.separator_token is not None:
+                input1 = PrependTokenDataset(input1, self.cfg.separator_token)
+
+            src_tokens = ConcatSentencesDataset(input0, input1)
+        # Seeded permutation; it is only consumed when no_shuffle is False (see below).
+        with data_utils.numpy_seed(self.cfg.seed):
+            shuffle = np.random.permutation(len(src_tokens))
+
+        src_tokens = maybe_shorten_dataset(
+            src_tokens,
+            split,
+            self.cfg.shorten_data_split_list,
+            self.cfg.shorten_method,
+            self.max_positions(),
+            self.cfg.seed,
+        )
+        # d2v2_multi selects the "source"/"padding_mask" layout over "src_tokens".
+        if self.cfg.d2v2_multi:
+            net_input = {
+                "source": RightPadDataset(
+                    src_tokens,
+                    pad_idx=self.source_dictionary.pad(),
+                ),
+                "id": IdDataset(),
+                "padding_mask": RightPaddingMaskDataset(src_tokens),
+            }
+        else:
+            net_input = {
+                "src_tokens": RightPadDataset(
+                    src_tokens,
+                    pad_idx=self.source_dictionary.pad(),
+                ),
+                "src_lengths": NumelDataset(src_tokens, reduce=False),
+            }
+            if self.cfg.add_prev_output_tokens:
+                prev_tokens_dataset = RightPadDataset(
+                    RollDataset(src_tokens, 1),
+                    pad_idx=self.dictionary.pad(),
+                )
+                net_input.update(
+                    prev_output_tokens=prev_tokens_dataset,
+                )
+
+        dataset = {
+            "id": IdDataset(),
+            "net_input": net_input,
+            "nsentences": NumSamplesDataset(),
+            "ntokens": NumelDataset(src_tokens, reduce=True),
+        }
+        # Classification loads an indexed label dataset; regression parses a text file.
+        if not self.cfg.regression_target:
+            label_dataset = make_dataset("label", self.label_dictionary)
+            if label_dataset is not None:
+                dataset.update(
+                    target=OffsetTokensDataset(
+                        StripTokenDataset(
+                            label_dataset,
+                            id_to_strip=self.label_dictionary.eos(),
+                        ),
+                        offset=-self.label_dictionary.nspecial,
+                    )
+                )
+        else:
+            label_path = "{0}.label".format(get_path("label", split))
+            if os.path.exists(label_path):
+
+                def parse_regression_target(i, line):
+                    values = line.split()
+                    assert (
+                        len(values) == self.cfg.num_classes
+                    ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"'
+                    return [float(x) for x in values]
+
+                with open(label_path) as h:
+                    dataset.update(
+                        target=RawLabelDataset(
+                            [
+                                parse_regression_target(i, line.strip())
+                                for i, line in enumerate(h.readlines())
+                            ]
+                        )
+                    )
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[src_tokens.sizes],
+        )
+
+        if self.cfg.no_shuffle:
+            dataset = nested_dataset
+        else:
+            dataset = SortDataset(
+                nested_dataset,
+                # shuffle: the seeded permutation is the only sort key
+                sort_order=[shuffle],
+            )
+
+        logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_model(self, cfg, from_checkpoint=False):
+        from fairseq import models
+        # Copy the task's max_positions onto cfg (inside open_dict for OmegaConf configs).
+        with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack():
+            cfg.max_positions = self.cfg.max_positions
+
+        model = models.build_model(cfg, self, from_checkpoint)
+
+        model.register_classification_head(
+            self.cfg.classification_head_name,
+            num_classes=self.cfg.num_classes,
+        )
+
+        return model
+
+    def max_positions(self):
+        return self.cfg.max_positions
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary  # the input dictionary, not the label dictionary
+
+    @property
+    def label_dictionary(self):
+        return self._label_dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
new file mode 100644
index 0000000000..afe5569626
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
@@ -0,0 +1,56 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import contextlib
+from omegaconf import open_dict, OmegaConf
+
+from fairseq.tasks import register_task
+from fairseq.tasks.sentence_prediction import (
+    SentencePredictionTask,
+    SentencePredictionConfig,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("sentence_prediction_adapters", dataclass=SentencePredictionConfig)
+class SentencePredictionAdapterTask(SentencePredictionTask):
+    def build_model(self, cfg):
+        from fairseq import models
+
+        with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack():
+            cfg.max_positions = self.cfg.max_positions
+
+        model = models.build_model(cfg, self)
+
+        model.register_classification_head(
+            self.cfg.classification_head_name,
+            num_classes=self.cfg.num_classes,
+        )
+
+        logger.info("Freezing Embedding Parameters")
+        for parameter in model.encoder.sentence_encoder.embed_positions.parameters():
+            parameter.requires_grad = False
+        for (
+            parameter
+        ) in model.encoder.sentence_encoder.layernorm_embedding.parameters():
+            parameter.requires_grad = False
+        for parameter in model.encoder.sentence_encoder.embed_tokens.parameters():
+            parameter.requires_grad = False
+
+        logger.info("Freezing Adapters")
+        for k, v in model.encoder.sentence_encoder.layers._modules.items():
+            logger.info("Freezing Adapters in Layer " + str(k))
+            if hasattr(v, "adapter_layer_norm"):
+                logger.info("Freezing Adapter LN")
+                for parameter in v.adapter_layer_norm.parameters():
+                    parameter.requires_grad = False
+            for parameter in v.adapter_modules.parameters():
+                parameter.requires_grad = False
+
+        return model
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
new file mode 100644
index 0000000000..57f63aab67
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
@@ -0,0 +1,219 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import numpy as np
+from fairseq import utils
+from fairseq.data import (
+    ConcatSentencesDataset,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    RightPadDataset,
+    SortDataset,
+    TruncateDataset,
+    data_utils,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("sentence_ranking")
+class SentenceRankingTask(LegacyFairseqTask):
+    """
+    Ranking task on multiple sentences.
+
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument("data", metavar="FILE", help="file prefix for data")
+        parser.add_argument(
+            "--num-classes", type=int, help="number of sentences to be ranked"
+        )
+        parser.add_argument(
+            "--init-token",
+            type=int,
+            help="add token at the beginning of each batch item",
+        )
+        parser.add_argument(
+            "--separator-token", type=int, help="add separator token between inputs"
+        )
+        parser.add_argument("--no-shuffle", action="store_true")
+        parser.add_argument(
+            "--shorten-method",
+            default="none",
+            choices=["none", "truncate", "random_crop"],
+            help="if not none, shorten sequences that exceed --tokens-per-sample",
+        )
+        parser.add_argument(
+            "--shorten-data-split-list",
+            default="",
+            help="comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)',
+        )
+        parser.add_argument(
+            "--max-option-length", type=int, help="max length for each option"
+        )
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+
+    @classmethod
+    def load_dictionary(cls, args, filename, source=True):
+        """Load the dictionary from the filename
+
+        Args:
+            filename (str): the filename
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol("<mask>")
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        assert (
+            args.criterion == "sentence_ranking"
+        ), "Must set --criterion=sentence_ranking"
+
+        # load data dictionary
+        data_dict = cls.load_dictionary(
+            args,
+            os.path.join(args.data, "input0", "dict.txt"),
+            source=True,
+        )
+        logger.info("[input] dictionary: {} types".format(len(data_dict)))
+        return SentenceRankingTask(args, data_dict)
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split (e.g., train, valid, test)."""
+
+        def get_path(type, split):
+            return os.path.join(self.args.data, type, split)
+
+        def make_dataset(type, dictionary):
+            split_path = get_path(type, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            return dataset
+
+        input0 = make_dataset("input0", self.source_dictionary)
+        input_options = [
+            make_dataset("input{idx}".format(idx=idx + 1), self.source_dictionary)
+            for idx in range(self.args.num_classes)
+        ]
+
+        if self.args.separator_token is not None:
+            input0 = PrependTokenDataset(input0, self.args.separator_token)
+
+        src_tokens = []
+        for input_option in input_options:
+            if self.args.init_token is not None:
+                input_option = PrependTokenDataset(input_option, self.args.init_token)
+            if self.args.max_option_length is not None:
+                input_option = TruncateDataset(
+                    input_option, self.args.max_option_length
+                )
+            src_token = ConcatSentencesDataset(input_option, input0)
+            src_token = maybe_shorten_dataset(
+                src_token,
+                split,
+                self.args.shorten_data_split_list,
+                self.args.shorten_method,
+                self.args.max_positions,
+                self.args.seed,
+            )
+            src_tokens.append(src_token)
+
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(src_tokens[0]))
+
+        dataset = {
+            "id": IdDataset(),
+            "nsentences": NumSamplesDataset(),
+            "ntokens": NumelDataset(src_tokens[0], reduce=True),
+        }
+
+        for src_token_idx in range(len(src_tokens)):
+            dataset.update(
+                {
+                    "net_input{idx}".format(idx=src_token_idx + 1): {
+                        "src_tokens": RightPadDataset(
+                            src_tokens[src_token_idx],
+                            pad_idx=self.source_dictionary.pad(),
+                        ),
+                        "src_lengths": NumelDataset(
+                            src_tokens[src_token_idx], reduce=False
+                        ),
+                    }
+                }
+            )
+
+        label_path = "{}.label".format(get_path("label", split))
+        if os.path.exists(label_path):
+            with open(label_path) as h:
+                dataset.update(
+                    target=RawLabelDataset([int(x.strip()) for x in h.readlines()])
+                )
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
+        )
+
+        if self.args.no_shuffle:
+            dataset = nested_dataset
+        else:
+            dataset = SortDataset(
+                nested_dataset,
+                # shuffle
+                sort_order=[shuffle],
+            )
+
+        logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_model(self, args, from_checkpoint=False):
+        from fairseq import models
+
+        model = models.build_model(args, self, from_checkpoint)
+
+        model.register_classification_head(
+            getattr(args, "ranking_head_name", "sentence_classification_head"),
+            num_classes=1,
+        )
+
+        return model
+
+    def max_positions(self):
+        return self.args.max_positions
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
new file mode 100644
index 0000000000..9576b26801
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from fairseq.tasks import register_task
+from fairseq.tasks.speech_to_text import SpeechToTextTask
+from fairseq.tasks.translation import TranslationTask, TranslationConfig
+
+try:
+    import examples.simultaneous_translation  # noqa
+
+    import_successful = True
+except BaseException:
+    import_successful = False
+
+
+logger = logging.getLogger(__name__)
+
+
+def check_import(flag):
+    if not flag:
+        raise ImportError(
+            "'examples.simultaneous_translation' is not correctly imported. "
+            "Please considering `pip install -e $FAIRSEQ_DIR`."
+        )
+
+
+@register_task("simul_speech_to_text")
+class SimulSpeechToTextTask(SpeechToTextTask):  # SpeechToTextTask behind an import guard
+    def __init__(self, args, tgt_dict):
+        check_import(import_successful)  # ImportError if the examples import failed
+        super().__init__(args, tgt_dict)
+
+
+@register_task("simul_text_to_text", dataclass=TranslationConfig)
+class SimulTextToTextTask(TranslationTask):  # TranslationTask behind an import guard
+    def __init__(self, cfg, src_dict, tgt_dict):
+        check_import(import_successful)  # ImportError if the examples import failed
+        super().__init__(cfg, src_dict, tgt_dict)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
new file mode 100644
index 0000000000..d746aa154c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
@@ -0,0 +1,243 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+from omegaconf import II, MISSING
+
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    data_utils,
+)
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.data.span_mask_tokens_dataset import SpanMaskedTokensDataset
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+
+from ..data.indexed_dataset import get_available_dataset_impl
+
+logger = logging.getLogger(__name__)
+
+SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+
+
+@dataclass
+class SpanMaskedLMConfig(FairseqDataclass):
+    shuffle: bool = field(
+        default=False,
+    )
+    noise_density: float = field(
+        default=0.15,
+        metadata={"help": "What fraction of the tokens to select as noise"},
+    )
+    mean_noise_span_length: float = field(
+        default=3,
+        metadata={"help": "Mean noise span length, must be >= 1"},
+    )
+    data: str = field(
+        default=MISSING,
+        metadata={
+            "help": "colon separated path to data directories list, "
+            "will be iterated upon during epochs in round-robin manner"
+        },
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="none",
+        metadata={
+            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    seed: int = II("common.seed")
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    max_source_positions: int = field(
+        default=1024, metadata={"help": "max number of tokens in the source sequence"}
+    )
+    max_target_positions: int = field(
+        default=1024, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    include_target_tokens: bool = field(
+        default=False,
+        metadata={
+            "help": "include target tokens in model input. this is used for data2vec"
+        },
+    )
+
+
+@register_task("span_masked_lm", dataclass=SpanMaskedLMConfig)
+class SpanMaskedLMTask(FairseqTask):
+    """
+    Span masked language modeling task (i.e., in the style of T5).
+    """
+
+    cfg: SpanMaskedLMConfig
+
+    def __init__(self, cfg, dictionary):
+        super().__init__(cfg)
+        self.dictionary = dictionary
+
+    @classmethod
+    def setup_task(cls, cfg: SpanMaskedLMConfig, **kwargs):
+        """Set up the task: load ``dict.txt`` from the first data path."""
+        paths = utils.split_paths(cfg.data)
+        assert len(paths) > 0
+        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+        logger.info("dictionary: {} types".format(len(dictionary)))
+        if not hasattr(cfg, "shuffle"):  # default for configs lacking the field
+            cfg.shuffle = False
+        return cls(cfg, dictionary)
+
+    def _load_dataset_split(self, split, epoch, combine):
+        paths = utils.split_paths(self.cfg.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]  # cycle through paths by epoch
+        split_path = os.path.join(data_path, split)
+
+        dataset = data_utils.load_indexed_dataset(
+            split_path,
+            self.dictionary,
+            self.cfg.dataset_impl,
+            combine=combine,
+        )
+        if dataset is None:
+            raise FileNotFoundError(
+                "Dataset not found: {} ({})".format(split, split_path)
+            )
+        # Strip EOS from every loaded item; an EOS is appended to each block below.
+        dataset = StripTokenDataset(dataset, self.dictionary.eos())
+
+        dataset = maybe_shorten_dataset(
+            dataset,
+            split,
+            self.cfg.shorten_data_split_list,
+            self.cfg.shorten_method,
+            self.cfg.tokens_per_sample,
+            self.cfg.seed,
+        )
+
+        # create continuous blocks of tokens
+        dataset = TokenBlockDataset(
+            dataset,
+            dataset.sizes,
+            self.cfg.tokens_per_sample - 2,  # one less for <s> and one for </s>
+            pad=self.dictionary.pad(),
+            eos=self.dictionary.eos(),
+            break_mode=self.cfg.sample_break_mode,
+            document_sep_len=0,
+        )
+        logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))
+
+        # wrap every block as <s> ... </s> (<s> is equiv. to [CLS] in BERT)
+        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
+        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())
+        return dataset
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split into ``self.datasets[split]``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        dataset = self._load_dataset_split(split, epoch, combine)
+        # The blocks are wrapped with the noise settings taken from the task config.
+        self.datasets[split] = SpanMaskedTokensDataset(
+            dataset,
+            self.dictionary,
+            noise_density=self.cfg.noise_density,
+            mean_noise_span_length=self.cfg.mean_noise_span_length,
+            shuffle=self.cfg.shuffle,
+            seed=self.cfg.seed,
+        )
+        logger.info(
+            "Split: {0}, Loaded {1} samples of span_masked_tokens_dataset".format(
+                split,
+                len(self.datasets[split]),
+            )
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
+        """
+        Generate batches for inference. We assume that the input begins with a
+        bos symbol (`<s>`) and ends with an eos symbol (`</s>`).
+        """
+        pad = self.source_dictionary.pad()
+        eos = self.source_dictionary.eos()
+        src_dataset = TokenBlockDataset(
+            src_tokens,
+            src_lengths,
+            block_size=self.cfg.tokens_per_sample - 2,  # for <s> and </s>
+            pad=pad,
+            eos=eos,
+            break_mode=self.cfg.sample_break_mode,
+            document_sep_len=0,
+        )
+        prev_output_tokens = PrependTokenDataset(
+            StripTokenDataset(src_dataset, eos), eos
+        )
+        src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False)
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": src_dataset,
+                    "src_lengths": NumelDataset(src_dataset, reduce=False),
+                    "prev_output_tokens": PadDataset(
+                        prev_output_tokens, pad_idx=pad, left_pad=False
+                    ),
+                },
+                "target": src_dataset,
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    def max_positions(self):
+        """Return the max sentence length allowed by the task, as (source, target)."""
+        return (self.cfg.max_source_positions, self.cfg.max_target_positions)
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary`."""
+        return self.dictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
new file mode 100644
index 0000000000..340732b928
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
@@ -0,0 +1,561 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+from collections import OrderedDict
+
+import numpy as np
+import torch
+from fairseq import utils
+from fairseq.data import (
+    AppendTokenDataset,
+    Dictionary,
+    IdDataset,
+    LMContextWindowDataset,
+    MonolingualDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    PadDataset,
+    PrependTokenDataset,
+    SpeechDLMDataset,
+    StripTokenDataset,
+    TokenBlockDataset,
+    TruncatedDictionary,
+    data_utils,
+)
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+from fairseq.data.shorten_dataset import maybe_shorten_dataset
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import LegacyFairseqTask, register_task
+from omegaconf import II
+
+
+SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
+SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SpeechDLMConfig(FairseqDataclass):  # options of the "speech_dlm_task" task
+    data: Optional[str] = field(
+        default=None, metadata={"help": "path to data directory"}
+    )
+    channels: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": 'comma-separated list of channels to load e.g., "unitA,unitB" '
+            "(default: load all possible channels in the data path)"
+        },
+    )
+    channel_weights: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "comma-separated list of weights for different losses "
+            "(default: None, which means all losses are treated equally)"
+        },
+    )
+    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
+        default="none",
+        metadata={
+            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
+            'tokens. If set to "complete", splits samples only at the end '
+            "of sentence, but may include multiple sentences per sample. "
+            '"complete_doc" is similar but respects doc boundaries. '
+            'If set to "eos", includes only one sentence per sample.'
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    output_dictionary_size: int = field(
+        default=-1, metadata={"help": "limit the size of output dictionary"}
+    )
+    # the flags below are str (not bool) as a workaround to allow **default=True**
+    next_unit_prediction: str = field(
+        default="False",
+        metadata={
+            "help": "Perform Next Unit Prediction, expected str input ('True' or 'False')"
+        },
+    )
+    edge_unit_prediction: str = field(
+        default="True",
+        metadata={
+            "help": "Perform Edge Unit Prediction, expected str input ('True' or 'False')"
+        },
+    )
+    duration_prediction: str = field(
+        default="True",
+        metadata={
+            "help": "Perform Duration Prediction, expected str input ('True' or 'False')"
+        },
+    )
+    delayed_duration_target: str = field(
+        default="True",
+        metadata={
+            "help": "Perform Delayed Duration Prediction, expected str input ('True' or 'False') "
+            "(default: 'True')"
+        },
+    )
+    max_target_durations: Optional[int] = field(
+        default=256,
+        metadata={"help": "max duration considered (cut off to this value)"},
+    )
+    add_bos_token: bool = field(
+        default=False, metadata={"help": "prepend beginning of sentence token (<s>)"}
+    )
+    max_target_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+    # TODO common vars below add to parent
+    seed: int = II("common.seed")
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    data_buffer_size: int = II("dataset.data_buffer_size")
+    tpu: bool = II("common.tpu")
+
+
+@register_task("speech_dlm_task", dataclass=SpeechDLMConfig)
+class SpeechDLMTask(LegacyFairseqTask):
+    """Task for the SpeechDLM model as described in the paper:
+    https://arxiv.org/pdf/2203.16502.pdf
+
+    It creates a multi-channel dataset (SpeechDLMDataset) from multiple
+    dictionaries.
+
+    Args:
+        dictionaries (Dict[str, ~fairseq.data.Dictionary]): the dictionaries for
+            each input channel of the SpeechDLM model
+        output_dictionaries (Dict[str, ~fairseq.data.Dictionary]): the dictionaries
+            for the output of each channel of the SpeechDLM model. In most cases it
+            will be the same as *dictionaries*.
+        targets (List[str]): list of the target types that the SpeechDLM model
+            should predict.  Can be one of "next", "edge", "duration".
+            Defaults to "next".
+
+    .. note::
+
+        The SpeechDLM task is only compatible with
+        :mod:`fairseq-train` and :mod:`fairseq-validate`.
+        To generate new samples, please refer to example codes
+        at examples/textless_nlp/dgslm .
+    """
+
+    def __init__(self, args, dicts, output_dicts=None, targets=None):
+        """Store the per-channel dictionaries and parse the task options.
+
+        Args:
+            args: task config; ``channel_weights``, the four string-valued
+                prediction flags and ``max_target_durations`` are read here.
+            dicts: mapping from channel name to its input dictionary; the key
+                order defines ``self.channels``.
+            output_dicts: mapping from channel name to its output dictionary
+                (``dicts`` is used when this is falsy).
+            targets: list of target types (defaults to ``["next"]``).
+        """
+        super().__init__(args)
+        self.dicts = dicts
+        self.output_dicts = output_dicts or dicts
+
+        self.targets = ["next"] if targets is None else targets
+        self.channels = list(dicts.keys())
+
+        # One loss weight per channel; every channel weighs 1.0 by default.
+        if args.channel_weights is None:
+            self.channel_weights = [1.0 for _ in self.channels]
+        else:
+            self.channel_weights = [float(w) for w in args.channel_weights.split(",")]
+        assert len(self.channel_weights) == len(
+            self.channels
+        ), "number of channel_weights must be the same as number of channels"
+
+        # The boolean options arrive as strings ('True'/'False'); check them all
+        # before converting any of them.
+        flag_names = (
+            "next_unit_prediction",
+            "edge_unit_prediction",
+            "duration_prediction",
+            "delayed_duration_target",
+        )
+        for flag_name in flag_names:
+            raw_value = getattr(args, flag_name)
+            assert str(raw_value).lower() in [
+                "true",
+                "false",
+            ], f"Expected to be a string of boolean, found {raw_value}"
+        # all flags are valid: store them as real booleans on the task
+        for flag_name in flag_names:
+            setattr(self, flag_name, str(getattr(args, flag_name)).lower() == "true")
+
+        self.max_target_durations = args.max_target_durations
+
+    @classmethod
+    def setup_dictionary(cls, args, **kwargs):
+        """Load one :class:`~fairseq.data.Dictionary` per channel (sorted by name).
+
+        Channels come from ``args.channels`` or, when it is ``None``, from the
+        ``dict.<channel>.txt`` files in the first path of ``args.data``.
+        Returns the ordered dicts ``(dicts, output_dicts)`` keyed by channel.
+        """
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        data_path = paths[0]
+
+        if args.channels is None:
+            sorted_channels = sorted(
+                name[5:-4]
+                for name in os.listdir(data_path)
+                if name.startswith("dict.") and name.endswith(".txt")
+            )
+        else:
+            sorted_channels = sorted(args.channels.split(","))
+        logger.info("channels: {}".format(sorted_channels))
+        dicts = OrderedDict()
+        output_dicts = OrderedDict()
+        for channel in sorted_channels:
+            dictionary = Dictionary.load(
+                os.path.join(data_path, "dict.{}.txt".format(channel))
+            )
+            logger.info("[{}] dictionary: {} types".format(channel, len(dictionary)))
+            output_dictionary = dictionary
+            if args.output_dictionary_size >= 0:
+                output_dictionary = TruncatedDictionary(
+                    dictionary, args.output_dictionary_size
+                )
+            dicts[channel] = dictionary
+            output_dicts[channel] = output_dictionary
+            first = dicts[sorted_channels[0]]  # reference for the special symbols
+            assert dictionary.pad() == first.pad()
+            assert dictionary.bos() == first.bos()
+            assert dictionary.eos() == first.eos()
+            assert dictionary.unk() == first.unk()
+        return (dicts, output_dicts)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task: load the dictionaries and select the targets.
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        dicts, output_dicts = cls.setup_dictionary(args, **kwargs)
+
+        flag_to_target = (
+            ("next_unit_prediction", "next"),
+            ("edge_unit_prediction", "edge"),
+            ("duration_prediction", "duration"),
+        )
+        targets = [
+            target
+            for flag, target in flag_to_target
+            if str(getattr(args, flag, "false")).lower() == "true"
+        ]
+        # no option enabled: fall back to standard language modeling
+        return cls(args, dicts, output_dicts, targets=targets or ["next"])
+
+    def build_model(self, args):
+        model = super().build_model(args)  # the parent task builds the model
+        for target in self.targets:  # every requested target must be supported
+            if target not in model.supported_targets:
+                raise ValueError("Unsupported SpeechDLM target: {}".format(target))
+        return model
+
+    def load_dataset(self, split: str, epoch=1, combine=False, **kwargs) -> None:
+        """Load a given dataset split into ``self.datasets[split]``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): 1-based epoch used to pick one of the data paths
+            combine (bool): forwarded to ``data_utils.load_indexed_dataset``
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        # identical for every channel, so it is computed once up front
+        add_eos_for_other_targets = (
+            self.args.sample_break_mode is not None
+            and self.args.sample_break_mode != "none"
+        )
+
+        channel_datasets = {}
+        for channel in self.channels:
+            split_path = os.path.join(data_path, split + "." + channel)
+            dictionary = self.dicts[channel]
+            output_dictionary = self.output_dicts[channel]
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path, dictionary, self.args.dataset_impl, combine=combine
+            )
+
+            if dataset is None:
+                raise FileNotFoundError(
+                    "[{}] Dataset not found: {} ({})".format(channel, split, split_path)
+                )
+
+            dataset = maybe_shorten_dataset(
+                dataset,
+                split,
+                self.args.shorten_data_split_list,
+                self.args.shorten_method,
+                self.args.tokens_per_sample,
+                self.args.seed,
+            )
+
+            dataset = TokenBlockDataset(
+                dataset,
+                dataset.sizes,
+                self.args.tokens_per_sample,
+                pad=dictionary.pad(),
+                eos=dictionary.eos(),
+                break_mode=self.args.sample_break_mode,
+                include_targets=True,
+            )
+
+            channel_datasets[channel] = MonolingualDataset(
+                dataset=dataset,
+                sizes=dataset.sizes,
+                src_vocab=dictionary,
+                tgt_vocab=output_dictionary,
+                add_eos_for_other_targets=add_eos_for_other_targets,
+                shuffle=False,
+                targets=["future"],
+                add_bos_token=self.args.add_bos_token,
+            )
+
+        self.datasets[split] = SpeechDLMDataset(
+            datasets=channel_datasets,
+            targets=self.targets,
+            max_target_durations=self.max_target_durations,
+            shuffle=True,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
+        """
+        Build the inference dataset from per-channel token sequences.
+
+        For every channel the eos-stripped sequence is used twice: prefixed
+        with eos (or bos if `--add-bos-token` is set) as the source, and with
+        a <pad> appended as the target. This is convenient both for generation
+        with a prefix and LM scoring.
+
+        Args:
+            src_tokens: sequence with one mapping per sample, from channel
+                name to that sample's tokens; the channels are read from its
+                first item.
+            src_lengths: lengths handed to ``TokenBlockDataset`` and used as
+                the sizes of the returned dataset.
+
+        Returns:
+            NestedDictionaryDataset with "id", "net_input" (per-channel
+            "src_tokens" plus "src_lengths") and per-channel "target".
+        """
+
+        def right_padded(channel_datasets):
+            # wrap each channel in a right-padding PadDataset, keeping the order
+            return OrderedDict(
+                (
+                    channel,
+                    PadDataset(
+                        channel_dataset,
+                        pad_idx=self.source_dictionaries[channel].pad(),
+                        left_pad=False,
+                    ),
+                )
+                for channel, channel_dataset in channel_datasets.items()
+            )
+
+        src_datasets, tgt_datasets = {}, {}
+        for channel in src_tokens[0]:
+            dictionary = self.source_dictionaries[channel]
+            pad, eos = dictionary.pad(), dictionary.eos()
+            stripped = StripTokenDataset(
+                TokenBlockDataset(
+                    [tokens[channel] for tokens in src_tokens],
+                    src_lengths,
+                    block_size=None,  # ignored for "eos" break mode
+                    pad=pad,
+                    eos=eos,
+                    break_mode="eos",
+                ),
+                # remove eos from (end of) target sequence
+                eos,
+            )
+
+            # the source starts with bos when requested, with eos otherwise
+            if getattr(self.args, "add_bos_token", False):
+                first_token = dictionary.bos()
+            else:
+                first_token = eos
+            src_datasets[channel] = PrependTokenDataset(stripped, token=first_token)
+            tgt_datasets[channel] = AppendTokenDataset(stripped, token=pad)
+
+        return NestedDictionaryDataset(
+            {
+                "id": IdDataset(),
+                "net_input": {
+                    "src_tokens": right_padded(src_datasets),
+                    "src_lengths": NumelDataset(
+                        next(iter(src_datasets.values())), reduce=False
+                    ),
+                },
+                "target": right_padded(tgt_datasets),
+            },
+            sizes=[np.array(src_lengths)],
+        )
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        """Run ``generator`` on ``sample``, using the source tokens as prefix."""
+        with torch.no_grad():
+            # Generation will always be conditioned on bos_token
+            use_bos = getattr(self.args, "add_bos_token", False)
+            dictionary = self.source_dictionary
+            bos_token = dictionary.bos() if use_bos else dictionary.eos()
+
+            if constraints is not None:
+                raise NotImplementedError(
+                    "Constrained decoding with the SpeechDLM task is not supported"
+                )
+            # SequenceGenerator doesn't use src_tokens directly, we need to
+            # pass the `prefix_tokens` argument instead
+            if prefix_tokens is None:
+                prefix_tokens = {}
+                src_tokens = sample["net_input"]["src_tokens"]
+                for channel in src_tokens:
+                    channel_prefix = src_tokens[channel]
+                    if not channel_prefix.nelement():
+                        # an empty channel disables the prefix altogether
+                        prefix_tokens = None
+                        break
+                    # drop the first column when every row starts with bos_token
+                    if channel_prefix[:, 0].eq(bos_token).all():
+                        channel_prefix = channel_prefix[:, 1:]
+                    prefix_tokens[channel] = channel_prefix
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
+            )
+
+    def eval_lm_dataloader(
+        self,
+        dataset,
+        max_tokens: Optional[int] = 36000,
+        batch_size: Optional[int] = None,
+        max_positions: Optional[int] = None,
+        num_shards: int = 1,
+        shard_id: int = 0,
+        num_workers: int = 1,
+        data_buffer_size: int = 10,
+        context_window: int = 0,
+    ):
+        """Return an unshuffled epoch iterator over ``dataset`` for evaluation."""
+        # each evaluated token gets >= context_window tokens of context, if possible
+        if context_window > 0:
+            dataset = LMContextWindowDataset(
+                dataset=dataset,
+                tokens_per_sample=self.args.tokens_per_sample,
+                context_window=context_window,
+                pad_idx=self.source_dictionary.pad(),
+            )
+        return self.get_batch_iterator(
+            dataset=dataset,
+            max_tokens=max_tokens,
+            max_sentences=batch_size,
+            max_positions=max_positions,
+            ignore_invalid_inputs=True,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            data_buffer_size=data_buffer_size,
+        ).next_epoch_itr(shuffle=False)
+
+    @property
+    def source_dictionary(self):
+        """Return the input :class:`~fairseq.data.Dictionary` of the first
+        channel (``self.channels[0]``)."""
+        return self.dicts[self.channels[0]]
+
+    @property
+    def target_dictionary(self):
+        """Return the output :class:`~fairseq.data.Dictionary` of the first
+        channel (``self.channels[0]``)."""
+        return self.output_dicts[self.channels[0]]
+
+    @property
+    def source_dictionaries(self):
+        """Return the per-channel input :class:`~fairseq.data.Dictionary`
+        mapping (``self.dicts``), keyed by channel name."""
+        return self.dicts
+
+    @property
+    def target_dictionaries(self):
+        """Return the per-channel output :class:`~fairseq.data.Dictionary`
+        mapping (``self.output_dicts``), keyed by channel name."""
+        return self.output_dicts
+
+    def build_generator(self, models, args, extra_gen_cls_kwargs=None):
+        """Build a ``MultichannelSequenceGenerator`` (sampling or beam search)."""
+        from fairseq.models.speech_dlm.sequence_generator import (
+            multichannel_search,
+            MultichannelSequenceGenerator,
+        )
+
+        # Choose search strategy. Defaults to Beam Search.
+        sampling = getattr(args, "sampling", False)
+        sampling_topk = getattr(args, "sampling_topk", -1)
+        sampling_topp = getattr(args, "sampling_topp", -1.0)
+        assert (
+            sampling_topk < 0 or sampling
+        ), "--sampling-topk requires sampling (not beam search)"
+        assert (
+            sampling_topp < 0 or sampling
+        ), "--sampling-topp requires sampling (not beam search)"
+
+        if sampling:
+            search_strategy = multichannel_search.ContiguousMultichannelSampling(
+                self.target_dictionaries, sampling_topk, sampling_topp
+            )
+        else:
+            search_strategy = multichannel_search.ContiguousMultichannelBeamSearch(
+                self.target_dictionaries
+            )
+
+        extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}
+
+        return MultichannelSequenceGenerator(
+            models,
+            self.target_dictionaries,
+            beam_size=getattr(args, "beam", 5),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 500),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            search_strategy=search_strategy,
+            duration_temperature=getattr(args, "duration_temperature", 1.0),
+            **extra_gen_cls_kwargs,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
new file mode 100644
index 0000000000..5aaaa95a90
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
@@ -0,0 +1,597 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+import math
+from argparse import Namespace
+from pathlib import Path
+from typing import List
+
+import torch
+import torch.nn as nn
+
+from fairseq import utils
+from fairseq.data import Dictionary
+from fairseq.data.audio.data_cfg import MultitaskConfig, S2SDataConfig
+from fairseq.data.audio.speech_to_speech_dataset import SpeechToSpeechDatasetCreator
+from fairseq.data.audio.speech_to_text_dataset import (
+    SpeechToTextDataset,
+    TextTargetMultitaskData,
+)
+from fairseq.tasks import LegacyFairseqTask, register_task
+from fairseq.tasks.speech_to_text import DummyMultiTask
+from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion
+
+logger = logging.getLogger(__name__)
+
+
+class StackUnitSequenceGenerator(nn.Module):
+    """Step-by-step argmax decoder for models predicting stacked unit frames."""
+    def __init__(self, tgt_dict, vocab_size):
+        super().__init__()
+        self.pad = tgt_dict.pad()
+        self.eos = tgt_dict.eos()
+        self.unk = tgt_dict.unk()
+        self.offset = len(tgt_dict) - vocab_size  # assumed first unit index (TODO confirm)
+        self.vocab_size = vocab_size
+
+    def pack_units(self, input: torch.Tensor, n_frames_per_step) -> torch.Tensor:
+        """Pack the ids of each stacked step into a single base-``vocab_size`` id."""
+        if n_frames_per_step <= 1:
+            return input
+
+        bsz, _, n = input.shape
+        assert n == n_frames_per_step
+        scale = [
+            pow(self.vocab_size, n_frames_per_step - 1 - i)
+            for i in range(n_frames_per_step)
+        ]
+        scale = torch.LongTensor(scale).squeeze(0).to(input.device)
+        mask = input >= self.offset  # ids below the offset contribute nothing
+        res = ((input - self.offset) * scale * mask).sum(dim=2) + self.offset
+        return res
+
+    @torch.no_grad()
+    def generate(self, models, sample, **kwargs):
+        """Decode ``sample`` with ``models[0]``; return one hypothesis per sentence."""
+        # currently only support viterbi search for stacked units
+        model = models[0]
+        model.eval()
+
+        max_len = model.max_decoder_positions()
+        # TODO: incorporate max_len_a and max_len_b
+        src_tokens = sample["net_input"]["src_tokens"]
+        src_lengths = sample["net_input"]["src_lengths"]
+        bsz, src_len, _ = src_tokens.size()
+        n_frames_per_step = model.decoder.n_frames_per_step
+
+        # initialize
+        encoder_out = model.forward_encoder(
+            src_tokens, src_lengths, speaker=sample["speaker"]
+        )
+        incremental_state = {}
+        pred_out, attn, scores = [], [], []
+        finished = src_tokens.new_zeros((bsz,)).bool()
+
+        prev_output_tokens = src_lengths.new_zeros((bsz, 1)).long().fill_(self.eos)
+        for _ in range(max_len):
+            cur_out, cur_extra = model.forward_decoder(
+                prev_output_tokens,
+                encoder_out=encoder_out,
+                incremental_state=incremental_state,
+            )
+
+            lprobs = model.get_normalized_probs([cur_out], log_probs=True)
+            # never select pad, unk
+            lprobs[:, :, self.pad] = -math.inf
+            lprobs[:, :, self.unk] = -math.inf
+
+            cur_pred_lprob, cur_pred_out = torch.max(lprobs, dim=2)
+            scores.append(cur_pred_lprob)
+            pred_out.append(cur_pred_out)
+
+            prev_output_tokens = torch.cat(
+                (
+                    prev_output_tokens,
+                    self.pack_units(
+                        cur_pred_out.view(bsz, 1, n_frames_per_step), n_frames_per_step
+                    ),
+                ),
+                dim=1,
+            )
+            attn.append(cur_extra["attn"][0])
+
+            cur_finished = torch.any(cur_pred_out.squeeze(1) == self.eos, dim=1)
+            finished = finished | cur_finished
+            if finished.sum().item() == bsz:  # every sentence has emitted eos
+                break
+
+        pred_out = torch.cat(pred_out, dim=1).view(bsz, -1)
+        attn = torch.cat(attn, dim=2)
+        alignment = attn.max(dim=1)[1]
+        attn = attn.repeat_interleave(n_frames_per_step, dim=2)
+        alignment = alignment.repeat_interleave(n_frames_per_step, dim=1)
+        scores = torch.cat(scores, dim=1)
+        eos_idx = (pred_out == self.eos).nonzero(as_tuple=True)
+        out_lens = src_lengths.new_zeros((bsz,)).long().fill_(max_len)
+        for b, l in zip(eos_idx[0], eos_idx[1]):
+            out_lens[b] = min(l, out_lens[b])  # keep the position of the first eos
+
+        hypos = [
+            [
+                {
+                    "tokens": pred_out[b, :out_len],
+                    "attn": attn[b, :, :out_len],
+                    "alignment": alignment[b, :out_len],
+                    "positional_scores": scores[b, :out_len],
+                    "score": utils.item(scores[b, :out_len].sum().data),
+                }
+            ]
+            for b, out_len in zip(range(bsz), out_lens)
+        ]
+
+        return hypos
+
+
+@register_task("speech_to_speech")
+class SpeechToSpeechTask(LegacyFairseqTask):
+    @classmethod
+    def add_args(cls, parser):  # registers the task-specific command-line arguments
+        parser.add_argument("data", help="manifest root path")
+        parser.add_argument(
+            "--config-yaml",
+            type=str,
+            default="config.yaml",
+            help="Configuration YAML filename (under manifest root)",
+        )
+        parser.add_argument(
+            "--multitask-config-yaml",
+            type=str,
+            default=None,
+            help="Configuration YAML filename for the multitasks (under manifest root)",
+        )
+        parser.add_argument(
+            "--max-source-positions",
+            default=6000,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the source sequence",
+        )
+        parser.add_argument(
+            "--max-target-positions",
+            default=1024,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the target sequence",
+        )
+        parser.add_argument(
+            "--target-is-code",
+            action="store_true",
+            help="set if target is discrete unit instead of spectrogram",
+        )
+        parser.add_argument(
+            "--target-code-size", type=int, default=None, help="# discrete units"
+        )
+        parser.add_argument(
+            "--n-frames-per-step",
+            type=int,
+            default=1,  # NOTE(review): setup_task asserts >= 1; help's "0" is rejected
+            help="# stacked frames, use 0 for reduced discrete unit sequence",
+        )
+        parser.add_argument("--eval-inference", action="store_true")
+        parser.add_argument(
+            "--eval-args",
+            type=str,
+            default="{}",
+            help='generation args for speech-to-unit model , e.g., \'{"beam": 5, "max_len_a": 1}\', as JSON string',
+        )
+        parser.add_argument("--eos-prob-threshold", type=float, default=0.5)
+        parser.add_argument(
+            "--mcd-normalize-type",
+            type=str,
+            default="targ",
+            choices=["targ", "pred", "path"],
+        )
+        parser.add_argument(
+            "--vocoder",
+            type=str,
+            default="griffin_lim",
+            choices=["griffin_lim", "hifigan", "code_hifigan"],
+        )
+        parser.add_argument("--spec-bwd-max-iter", type=int, default=8)
+        parser.add_argument(
+            "--infer-target-lang",
+            type=str,
+            default="",
+            help="target language for inference",
+        )
+
+    def __init__(self, args, tgt_dict, infer_tgt_lang_id=None):
+        """Store the target dictionary, load the data config and multitasks."""
+        super().__init__(args)
+        self.tgt_dict = tgt_dict
+        self.data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml)
+
+        self.multitask_tasks = {}
+        self.tgt_dict_mt = None
+        self.eos_token_mt = None
+        if getattr(args, "multitask_config_yaml", None) is not None:
+            multitask_cfg = MultitaskConfig(
+                Path(args.data) / args.multitask_config_yaml
+            )
+            first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index
+            for i, (task_name, task_config) in enumerate(
+                multitask_cfg.get_all_tasks().items()
+            ):
+                task_obj = DummyMultiTask(
+                    task_config,
+                    task_config.tgt_dict,
+                    first_pass=i == first_pass_task_idx,
+                )
+                self.multitask_tasks[task_name] = task_obj
+                if task_obj.is_first_pass_decoder:
+                    self.tgt_dict_mt = task_obj.target_dictionary
+                    if task_config.prepend_bos_and_append_tgt_lang_tag:
+                        self.eos_token_mt = task_config.eos_token
+                        assert not isinstance(self.eos_token_mt, List)
+                        if not self.eos_token_mt:
+                            raise Warning(  # NOTE(review): raised as an exception, not warnings.warn
+                                "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator"
+                            )
+
+        self._infer_tgt_lang_id = infer_tgt_lang_id
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Check ``args`` and build the task (dictionary only for unit targets)."""
+        data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml)
+        tgt_dict = None
+        infer_tgt_lang_id = None
+        if args.target_is_code:
+            if data_cfg.prepend_tgt_lang_tag_as_bos:
+                # dictionary with language tags
+                dict_path = Path(args.data) / data_cfg.vocab_filename
+                if not dict_path.is_file():
+                    raise FileNotFoundError(
+                        f"Dict has to be provided when setting prepend_tgt_lang_tag_as_bos: true, but dict not found: {dict_path}"
+                    )
+                tgt_dict = Dictionary.load(dict_path.as_posix())
+
+                # target language for inference
+                if args.infer_target_lang != "":
+                    tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format(
+                        args.infer_target_lang
+                    )
+                    infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag)
+                    assert infer_tgt_lang_id != tgt_dict.unk()
+            else:
+                assert args.target_code_size is not None
+                tgt_dict = Dictionary()
+                for i in range(args.target_code_size):
+                    tgt_dict.add_symbol(str(i))
+            logger.info(f"dictionary size: {len(tgt_dict):,}")
+
+        if getattr(args, "train_subset", None) is not None:
+            if not all(s.startswith("train") for s in args.train_subset.split(",")):
+                raise ValueError('Train splits should be named like "train*".')
+
+        assert args.n_frames_per_step >= 1
+        assert (
+            not args.eval_inference
+            or (args.target_is_code and args.vocoder == "code_hifigan")
+            or (not args.target_is_code and args.vocoder != "code_hifigan")
+        )
+
+        return cls(args, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id)
+
+    def build_criterion(self, args):  # builds the criterion via fairseq.criterions after a name check
+        from fairseq import criterions
+        # With multitask tasks configured, the criterion name prefix must match the target type.
+        if len(self.multitask_tasks) > 0:
+            if self.args.target_is_code and not args._name.startswith("speech_to_unit"):
+                raise ValueError(
+                    "set --criterion speech_to_unit for speech-to-unit loss with multitask"
+                )
+            elif not self.args.target_is_code and not args._name.startswith(
+                "speech_to_spectrogram"
+            ):
+                raise ValueError(
+                    "set --criterion speech_to_spectrogram for speech-to-spectrogram loss with multitask"
+                )
+
+        return criterions.build_criterion(args, self)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):  # `combine` and `kwargs` are not used here
+        self.datasets[split] = SpeechToSpeechDatasetCreator.from_tsv(
+            root=self.args.data,
+            data_cfg=self.data_cfg,
+            splits=split,
+            is_train_split=split.startswith("train"),  # train splits are recognised by their name prefix
+            epoch=epoch,
+            seed=self.args.seed,
+            target_is_code=self.args.target_is_code,
+            tgt_dict=self.target_dictionary,
+            n_frames_per_step=self.args.n_frames_per_step,
+            multitask=self.multitask_tasks,
+        )
+
+    @property
+    def target_dictionary(self):  # read-only view of self.tgt_dict
+        return self.tgt_dict
+
+    @property
+    def target_dictionary_mt(self):  # self.tgt_dict_mt is taken from the first-pass multitask decoder in __init__
+        return self.tgt_dict_mt
+
+    @property
+    def source_dictionary(self):  # this task exposes no source dictionary
+        return None
+
+    def max_positions(self):  # (max source positions, max target positions), both read from args
+        return self.args.max_source_positions, self.args.max_target_positions
+
+    def build_model(self, args, from_checkpoint=False):
+        args.input_feat_per_channel = self.data_cfg.input_feat_per_channel
+        args.input_channels = self.data_cfg.input_transformed_channels
+        args.target_speaker_embed = self.data_cfg.target_speaker_embed is not None
+        args.n_frames_per_step = self.args.n_frames_per_step
+        # The data-derived settings above are written onto `args` before the base class builds the model.
+        model = super().build_model(args, from_checkpoint)
+
+        if len(self.multitask_tasks) > 0:
+            from fairseq.models.speech_to_speech.s2s_transformer import (
+                S2STransformerMultitaskModelBase,
+            )
+            # Multitask training requires a model derived from the multitask base class.
+            assert isinstance(model, S2STransformerMultitaskModelBase)
+
+        if self.args.eval_inference:
+            self.eval_gen_args = json.loads(self.args.eval_args)  # eval_args is a JSON string of generation options
+            self.generator = self.build_generator(
+                [model], Namespace(**self.eval_gen_args)
+            )
+
+        return model
+
+    def build_generator_dual_decoder(self, models, args, extra_gen_cls_kwargs=None):
+        """Build a MultiDecoderSequenceGenerator for ``models``.
+
+        Search options are read from ``args`` with the defaults given below;
+        ``extra_gen_cls_kwargs`` may be None and is then treated as empty.
+        """
+        from examples.speech_to_speech.unity.sequence_generator_multi_decoder import (
+            MultiDecoderSequenceGenerator,
+        )
+
+        return MultiDecoderSequenceGenerator(
+            models,
+            self.target_dictionary,
+            self.target_dictionary_mt,
+            beam_size=max(1, getattr(args, "beam", 1)),
+            beam_size_mt=max(1, getattr(args, "beam_mt", 1)),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            max_len_a_mt=getattr(args, "max_len_a_mt", 0),
+            max_len_b_mt=getattr(args, "max_len_b_mt", 200),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            **(extra_gen_cls_kwargs or {}),  # `**None` would raise TypeError
+        )
+
+    def build_generator(
+        self,
+        models,
+        args,
+        seq_gen_cls=None,
+        extra_gen_cls_kwargs=None,
+    ):  # seq_gen_cls is accepted but not forwarded: the base call below always passes None
+        # A vocoder is loaded for non-code targets and whenever eval_inference is set.
+        if not self.args.target_is_code or self.args.eval_inference:
+            from fairseq.models.text_to_speech.vocoder import get_vocoder
+
+            self.vocoder = get_vocoder(self.args, self.data_cfg)
+            self.vocoder = (
+                self.vocoder.cuda()
+                if torch.cuda.is_available() and not self.args.cpu
+                else self.vocoder.cpu()
+            )
+        # A model with a non-None `mt_task_name` attribute is treated as a dual-decoder model.
+        has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None
+
+        if self.args.target_is_code:
+            if self.args.n_frames_per_step == 1:
+                if has_dual_decoder:
+                    seq_generator = self.build_generator_dual_decoder(
+                        models,
+                        args,
+                        extra_gen_cls_kwargs=extra_gen_cls_kwargs,  # may still be None on this path
+                    )
+                else:
+                    seq_generator = super().build_generator(
+                        models,
+                        args,
+                        seq_gen_cls=None,
+                        extra_gen_cls_kwargs=extra_gen_cls_kwargs,
+                    )
+            else:  # n_frames_per_step != 1
+                assert (
+                    getattr(args, "beam", 1) == 1 and getattr(args, "nbest", 1) == 1
+                ), "only support viterbi search for stacked units"
+                seq_generator = StackUnitSequenceGenerator(
+                    self.tgt_dict,
+                    self.args.target_code_size,
+                )
+        else:
+            if has_dual_decoder:
+                if getattr(args, "teacher_forcing", False):
+                    raise NotImplementedError  # no teacher-forcing generator is wired up for dual decoders
+                else:
+                    from fairseq.speech_generator import MultiDecoderSpeechGenerator
+
+                    generator = MultiDecoderSpeechGenerator
+                # Collect the ids of language-tag symbols in the first-pass dictionary.
+                lang_token_ids_aux = {
+                    i
+                    for s, i in self.tgt_dict_mt.indices.items()
+                    if TextTargetMultitaskData.is_lang_tag(s)
+                }
+
+                if extra_gen_cls_kwargs is None:
+                    extra_gen_cls_kwargs = {}
+                extra_gen_cls_kwargs[
+                    "symbols_to_strip_from_output"
+                ] = lang_token_ids_aux
+
+                eos_id_mt = (
+                    self.tgt_dict_mt.index(self.eos_token_mt)
+                    if self.eos_token_mt
+                    else None
+                )
+                assert eos_id_mt != self.tgt_dict_mt.unk()  # presumably rejects an eos token missing from the dictionary - TODO confirm
+                extra_gen_cls_kwargs["eos_mt"] = eos_id_mt
+
+                seq_generator = generator(
+                    models,
+                    args,
+                    self.vocoder,
+                    self.data_cfg,
+                    self.target_dictionary_mt,
+                    max_iter=self.args.max_target_positions,
+                    eos_prob_threshold=self.args.eos_prob_threshold,
+                    **extra_gen_cls_kwargs,
+                )
+            else:
+                if getattr(args, "teacher_forcing", False):
+                    from fairseq.speech_generator import (
+                        TeacherForcingAutoRegressiveSpeechGenerator,
+                    )
+
+                    generator = TeacherForcingAutoRegressiveSpeechGenerator
+                    logger.info("Teacher forcing mode for generation")
+                else:
+                    from fairseq.speech_generator import AutoRegressiveSpeechGenerator
+
+                    generator = AutoRegressiveSpeechGenerator
+                # Single-decoder speech generators receive only the first model.
+                seq_generator = generator(
+                    models[0],
+                    self.vocoder,
+                    self.data_cfg,
+                    max_iter=self.args.max_target_positions,
+                    eos_prob_threshold=self.args.eos_prob_threshold,
+                )
+
+        return seq_generator
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):  # refreshes every multitask loss weight for this update, then defers to the base train_step
+        for task_name, task_obj in self.multitask_tasks.items():
+            criterion.set_multitask_loss_weight(
+                task_name, task_obj.args.get_loss_weight(update_num)
+            )
+            if task_name in model.multitask_decoders:
+                model.multitask_decoders[task_name].train()  # only decoders the model actually owns
+
+        loss, sample_size, logging_output = super().train_step(
+            sample, model, criterion, optimizer, update_num, ignore_grad
+        )
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        for task_name in self.multitask_tasks.keys():
+            if task_name in model.multitask_decoders:
+                model.multitask_decoders[task_name].eval()
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        # With eval_inference on, generation-based metrics are merged into logging_output.
+        if self.args.eval_inference:
+            hypos, inference_losses = self.valid_step_with_inference(  # `hypos` is not used afterwards
+                sample, model, self.generator
+            )
+            for k, v in inference_losses.items():
+                assert k not in logging_output  # inference keys must not overwrite existing entries
+                logging_output[k] = v
+
+        return loss, sample_size, logging_output
+
+    def valid_step_with_inference(self, sample, model, generator):  # returns (hypos, losses dict)
+        if self.args.target_is_code:
+            hypos = generator.generate([model], sample)
+            tgt_lens = (
+                sample["target_lengths"] - 1
+            ) * self.args.n_frames_per_step  # strip <eos>
+            for b, (f, l) in enumerate(zip(sample["target"], tgt_lens)):
+                hypos[b][0]["targ_waveform"] = self.vocoder(
+                    {"code": f[:l] - 4},  # remove <bos>, <pad>, <eos>, <unk>
+                    dur_prediction=self.eval_gen_args.get("dur_prediction", False),
+                )
+                if len(hypos[b][0]["tokens"]) > 0:
+                    hypos[b][0]["waveform"] = self.vocoder(
+                        {"code": hypos[b][0]["tokens"] - 4},
+                        dur_prediction=self.eval_gen_args.get("dur_prediction", False),
+                    )
+                else:  # nothing was generated: use the target waveform flipped along dim 0
+                    hypos[b][0]["waveform"] = torch.flip(
+                        hypos[b][0]["targ_waveform"], dims=[0]
+                    )
+        else:
+            hypos = [
+                [hypo] for hypo in generator.generate(model, sample, has_targ=True)
+            ]
+        # Totals accumulated over every (target, prediction) pair in the batch.
+        losses = {
+            "mcd_loss": 0.0,
+            "targ_frames": 0.0,
+            "pred_frames": 0.0,
+            "path_frames": 0.0,
+            "nins": 0.0,
+            "ndel": 0.0,
+        }
+        rets = batch_mel_cepstral_distortion(
+            [hypo[0]["targ_waveform"] for hypo in hypos],
+            [hypo[0]["waveform"] for hypo in hypos],
+            self.data_cfg.output_sample_rate,
+            normalize_type=None,
+        )
+        for d, extra in rets:
+            pathmap = extra[-1]  # last element of `extra`; indexed below as a 2-D tensor
+            losses["mcd_loss"] += d.item()
+            losses["targ_frames"] += pathmap.size(0)
+            losses["pred_frames"] += pathmap.size(1)
+            losses["path_frames"] += pathmap.sum().item()
+            losses["nins"] += (pathmap.sum(dim=1) - 1).sum().item()
+            losses["ndel"] += (pathmap.sum(dim=0) - 1).sum().item()
+        losses["norm_frames"] = losses[  # copies "<mcd_normalize_type>_frames"; the type defaults to "targ"
+            f"{getattr(self.args, 'mcd_normalize_type', 'targ')}_frames"
+        ]
+
+        return hypos, losses
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):  # runs generation under torch.no_grad()
+        with torch.no_grad():
+            if self._infer_tgt_lang_id is not None:  # a target-language id was resolved in setup_task
+                return generator.generate(
+                    models,
+                    sample,
+                    prefix_tokens=prefix_tokens,
+                    constraints=constraints,
+                    bos_token=self._infer_tgt_lang_id,  # the language-tag id is passed as bos_token
+                )
+            else:
+                return super().inference_step(
+                    generator,
+                    models,
+                    sample,
+                    prefix_tokens=prefix_tokens,
+                    constraints=constraints,
+                )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
new file mode 100644
index 0000000000..884082112a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
@@ -0,0 +1,350 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from argparse import Namespace
+from pathlib import Path
+from typing import List
+
+from fairseq.data import Dictionary, encoders
+from fairseq.data.audio.audio_utils import get_features_or_waveform
+from fairseq.data.audio.data_cfg import MultitaskConfig
+from fairseq.data.audio.speech_to_text_dataset import (
+    S2TDataConfig,
+    SpeechToTextDataset,
+    SpeechToTextDatasetCreator,
+    TextTargetMultitaskData,
+)
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("speech_to_text")
+class SpeechToTextTask(LegacyFairseqTask):
+    @classmethod
+    def add_args(cls, parser):  # registers this task's command-line options on `parser`
+        parser.add_argument("data", help="manifest root path")
+        parser.add_argument(
+            "--config-yaml",
+            type=str,
+            default="config.yaml",
+            help="Configuration YAML filename (under manifest root)",
+        )
+        parser.add_argument(
+            "--multitask-config-yaml",
+            type=str,
+            default=None,  # None leaves multitask setup disabled in __init__
+            help="Configuration YAML filename for the multitasks (under manifest root)",
+        )
+        parser.add_argument(
+            "--max-source-positions",
+            default=6000,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the source sequence",
+        )
+        parser.add_argument(
+            "--max-target-positions",
+            default=1024,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the target sequence",
+        )
+
+    def __init__(self, args, tgt_dict):
+        super().__init__(args)
+        self.tgt_dict = tgt_dict
+        self.data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml)
+        self.speaker_to_id = self._get_speaker_to_id()  # None when no speaker set file is configured
+        if (
+            self.data_cfg.prepend_tgt_lang_tag
+            and self.data_cfg.prepend_bos_and_append_tgt_lang_tag
+        ):
+            raise ValueError(
+                "Please set only one of the two options to avoid adding target token multiple times"
+            )
+        # Multitask state; it keeps these empty/None values unless a multitask config YAML is given.
+        self.multitask_tasks = {}
+        self.tgt_dict_mt = None
+        self.eos_token_mt = None
+        if getattr(args, "multitask_config_yaml", None) is not None:
+            multitask_cfg = MultitaskConfig(
+                Path(args.data) / args.multitask_config_yaml
+            )
+            first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index
+            for i, (task_name, task_config) in enumerate(
+                multitask_cfg.get_all_tasks().items()
+            ):
+                task_obj = DummyMultiTask(
+                    task_config,
+                    task_config.tgt_dict,
+                    first_pass=i == first_pass_task_idx,  # parsed as first_pass=(i == first_pass_task_idx)
+                )
+                self.multitask_tasks[task_name] = task_obj
+                if task_obj.is_first_pass_decoder:
+                    self.tgt_dict_mt = task_obj.target_dictionary
+                    if task_config.prepend_bos_and_append_tgt_lang_tag:
+                        self.eos_token_mt = task_config.eos_token
+                        assert not isinstance(self.eos_token_mt, List)
+                        # NOTE: the Warning class is raised as an exception here, not issued via warnings.warn.
+                        if not self.eos_token_mt:
+                            raise Warning(
+                                "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator"
+                            )
+
+    def _get_speaker_to_id(self):  # returns {stripped line: line index} or None
+        speaker_to_id = None
+        speaker_set_filename = self.data_cfg.config.get("speaker_set_filename")
+        if speaker_set_filename is not None:
+            speaker_set_path = Path(self.args.data) / speaker_set_filename  # resolved under the data root
+            with open(speaker_set_path) as f:
+                speaker_to_id = {r.strip(): i for i, r in enumerate(f)}  # one entry per line, ids in file order
+        return speaker_to_id
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):  # loads the target dictionary and returns cls(args, tgt_dict)
+        data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml)
+        dict_path = Path(args.data) / data_cfg.vocab_filename
+        if not dict_path.is_file():
+            raise FileNotFoundError(f"Dict not found: {dict_path.as_posix()}")
+        tgt_dict = Dictionary.load(dict_path.as_posix())
+        logger.info(
+            f"dictionary size ({data_cfg.vocab_filename}): " f"{len(tgt_dict):,}"
+        )
+        # Every comma-separated name in train_subset must start with "train".
+        if getattr(args, "train_subset", None) is not None:
+            if not all(s.startswith("train") for s in args.train_subset.split(",")):
+                raise ValueError('Train splits should be named like "train*".')
+        return cls(args, tgt_dict)
+
+    def build_criterion(self, args):  # builds the criterion via fairseq.criterions after a prefix-size check
+        from fairseq import criterions
+        # A prepended language tag requires the criterion to ignore exactly one prefix token.
+        if self.data_cfg.prepend_tgt_lang_tag and args.ignore_prefix_size != 1:
+            raise ValueError(
+                'Please set "--ignore-prefix-size 1" since '
+                "target language ID token is prepended as BOS."
+            )
+        return criterions.build_criterion(args, self)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):  # `combine` and `kwargs` are not used here
+        is_train_split = split.startswith("train")  # train splits are recognised by their name prefix
+        pre_tokenizer = self.build_tokenizer(self.args)
+        bpe_tokenizer = self.build_bpe(self.args)
+        self.datasets[split] = SpeechToTextDatasetCreator.from_tsv(
+            root=self.args.data,
+            cfg=self.data_cfg,
+            splits=split,
+            tgt_dict=self.tgt_dict,
+            pre_tokenizer=pre_tokenizer,
+            bpe_tokenizer=bpe_tokenizer,
+            is_train_split=is_train_split,
+            epoch=epoch,
+            seed=self.args.seed,
+            speaker_to_id=self.speaker_to_id,
+            multitask=self.multitask_tasks,
+        )
+
+    @property
+    def target_dictionary(self):  # the dictionary passed to __init__ as tgt_dict
+        return self.tgt_dict
+
+    @property
+    def target_dictionary_mt(self):  # first-pass multitask decoder's dictionary; None if __init__ found none
+        return self.tgt_dict_mt
+
+    @property
+    def source_dictionary(self):  # this task exposes no source dictionary
+        return None
+
+    def max_positions(self):  # (max source positions, max target positions), both read from args
+        return self.args.max_source_positions, self.args.max_target_positions
+
+    def build_model(self, args, from_checkpoint=False):  # copies data-derived settings onto `args`, then defers to the base class
+        args.input_feat_per_channel = self.data_cfg.input_feat_per_channel
+        args.input_channels = self.data_cfg.input_channels
+        args.speaker_to_id = self.speaker_to_id
+        return super(SpeechToTextTask, self).build_model(args, from_checkpoint)
+
+    def build_generator_dual_decoder(
+        self,
+        models,
+        args,
+        extra_gen_cls_kwargs,
+    ):  # extra_gen_cls_kwargs must be a dict that already has "symbols_to_strip_from_output" (build_generator sets it)
+        from examples.speech_to_speech.unity.sequence_generator_multi_decoder import (
+            MultiDecoderSequenceGenerator,
+        )
+        # Collect the ids of language-tag symbols in the first-pass dictionary.
+        lang_token_ids_aux = {
+            i
+            for s, i in self.tgt_dict_mt.indices.items()
+            if TextTargetMultitaskData.is_lang_tag(s)
+        }
+        # Mutates the caller's set in place so those ids are stripped as well.
+        extra_gen_cls_kwargs["symbols_to_strip_from_output"].update(lang_token_ids_aux)
+
+        eos_id_mt = (
+            self.tgt_dict_mt.index(self.eos_token_mt) if self.eos_token_mt else None
+        )
+        assert eos_id_mt != self.tgt_dict_mt.unk()  # presumably rejects an eos token missing from the dictionary - TODO confirm
+        extra_gen_cls_kwargs["eos_mt"] = eos_id_mt
+
+        return MultiDecoderSequenceGenerator(
+            models,
+            self.target_dictionary,
+            self.target_dictionary_mt,
+            beam_size=max(1, getattr(args, "beam", 1)),
+            beam_size_mt=max(1, getattr(args, "beam_mt", 1)),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            max_len_a_mt=getattr(args, "max_len_a_mt", 0),
+            max_len_b_mt=getattr(args, "max_len_b_mt", 0),  # NOTE(review): default is 0 here but 200 in the speech_to_speech counterpart - confirm
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            len_penalty_mt=getattr(args, "lenpen_mt", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            **extra_gen_cls_kwargs,
+        )
+
+    def build_generator(
+        self,
+        models,
+        args,
+        seq_gen_cls=None,
+        extra_gen_cls_kwargs=None,
+    ):  # seq_gen_cls is accepted but not forwarded: the base call below always passes None
+        if self.data_cfg.prepend_tgt_lang_tag and args.prefix_size != 1:
+            raise ValueError(
+                'Please set "--prefix-size 1" since '
+                "target language ID token is prepended as BOS."
+            )
+        lang_token_ids = {  # ids of dictionary symbols that are language tags
+            i
+            for s, i in self.tgt_dict.indices.items()
+            if SpeechToTextDataset.is_lang_tag(s)
+        }
+
+        if extra_gen_cls_kwargs is None:
+            extra_gen_cls_kwargs = {}
+        extra_gen_cls_kwargs["symbols_to_strip_from_output"] = lang_token_ids  # replaces any caller-supplied value
+        # An eos_token set on `args` takes precedence over the one in the data config.
+        eos_token = (
+            args.eos_token
+            if "eos_token" in args and args.eos_token is not None
+            else self.data_cfg.config.get("eos_token", None)
+        )
+        # NOTE: the Warning class is raised as an exception here, not issued via warnings.warn.
+        if self.data_cfg.prepend_bos_and_append_tgt_lang_tag and not eos_token:
+            raise Warning(
+                "Please provide --eos_token to replace eos in sequence generator"
+            )
+
+        eos_id = self.tgt_dict.index(eos_token) if eos_token else None
+        extra_gen_cls_kwargs["eos"] = eos_id
+        # A model with a non-None `mt_task_name` attribute is treated as a dual-decoder model.
+        has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None
+
+        if has_dual_decoder:
+            return self.build_generator_dual_decoder(
+                models,
+                args,
+                extra_gen_cls_kwargs=extra_gen_cls_kwargs,
+            )
+        else:
+            return super().build_generator(
+                models,
+                args,
+                seq_gen_cls=None,
+                extra_gen_cls_kwargs=extra_gen_cls_kwargs,
+            )
+
+    def train_step(
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):  # refreshes every multitask loss weight for this update, then defers to the base train_step
+        for task_name, task_obj in self.multitask_tasks.items():
+            criterion.set_multitask_loss_weight(
+                task_name, task_obj.args.get_loss_weight(update_num)
+            )
+            if task_name in model.multitask_decoders:
+                model.multitask_decoders[task_name].train()  # only decoders the model actually owns
+
+        loss, sample_size, logging_output = super().train_step(
+            sample, model, criterion, optimizer, update_num, ignore_grad
+        )
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        """Put the model's multitask decoders in eval mode, then run the base valid_step."""
+        for task_name in self.multitask_tasks:
+            if task_name in model.multitask_decoders:
+                model.multitask_decoders[task_name].eval()
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        return loss, sample_size, logging_output
+
+    def build_tokenizer(self, args):  # `args` is unused; settings come from data_cfg.pre_tokenizer
+        logger.info(f"pre-tokenizer: {self.data_cfg.pre_tokenizer}")
+        return encoders.build_tokenizer(Namespace(**self.data_cfg.pre_tokenizer))
+
+    def build_bpe(self, args):  # `args` is unused; settings come from data_cfg.bpe_tokenizer
+        logger.info(f"tokenizer: {self.data_cfg.bpe_tokenizer}")
+        return encoders.build_bpe(Namespace(**self.data_cfg.bpe_tokenizer))
+
+    def get_interactive_tokens_and_lengths(self, lines, encode_fn):  # `encode_fn` is unused; `lines` is returned unchanged
+        n_frames = [get_features_or_waveform(p).shape[0] for p in lines]  # first dimension of each loaded input
+        return lines, n_frames
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):  # `kwargs` is unused
+        return SpeechToTextDataset(  # "interactive" is presumably the split name and False the train flag - TODO confirm
+            "interactive", False, self.data_cfg, src_tokens, src_lengths
+        )
+
+
+class DummyMultiTask(LegacyFairseqTask):  # lightweight task object holding one multitask config and its dictionary
+    def __init__(self, args, tgt_dict, first_pass=False):  # SpeechToTextTask passes the task config as `args`
+        super().__init__(args)
+        self.tgt_dict = tgt_dict
+        self.first_pass = first_pass
+
+    @property
+    def target_dictionary(self):
+        return self.tgt_dict
+
+    @property
+    def is_first_pass_decoder(self):  # the `first_pass` flag given at construction
+        return self.first_pass
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):  # prefix_tokens and constraints are accepted but unused
+        if self.args.decoder_type == "ctc":
+            model = models[0]  # only support single model
+            encoder_out = model(**sample)
+            if hasattr(model, "get_logits"):
+                emissions = model.get_logits(
+                    encoder_out
+                )  # no need to normalize emissions
+            else:
+                emissions = model.get_normalized_probs(encoder_out, log_probs=True)
+            return generator.decode(  # emissions are transposed on dims 0/1, cast to float and moved to CPU
+                emissions.transpose(0, 1).float().cpu().contiguous()
+            )
+        else:
+            raise NotImplementedError("only ctc decoder is supported at the moment")
+
+    def build_generator(
+        self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None
+    ):  # models, seq_gen_cls and extra_gen_cls_kwargs are accepted but unused
+        if self.args.decoder_type == "ctc":
+            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
+
+            return W2lViterbiDecoder(args, self.tgt_dict)
+        else:
+            raise NotImplementedError("only ctc decoder is supported at the moment")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
new file mode 100644
index 0000000000..b9d3019d50
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import sys
+import torch
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
+
+from fairseq.data import Dictionary
+from fairseq.data.codedataset import ExpressiveCodeDataConfig, CodeDataset
+from fairseq.dataclass.configs import FairseqDataclass
+from fairseq.tasks import register_task
+from fairseq.tasks.fairseq_task import FairseqTask
+from omegaconf import MISSING, DictConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class UnitDictionary(Dictionary):
+    """
+    A fixed-sized Dictionary that operates on integer-valued tokens
+    with a trivial (identity) token <-> id mapping.
+    Special symbols (bos, pad, eos, unk, ...) are added after the n_units unit symbols.
+    """
+
+    def __init__(
+        self,
+        *,  # begin keyword-only arguments
+        n_units,
+        bos="<s>",
+        pad="<pad>",
+        eos="</s>",
+        unk="<unk>",
+        extra_special_symbols=None,
+        clip=False,
+    ):  # note: Dictionary.__init__ is not called; the fields are set up directly below
+        self.n_units = n_units
+        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
+        self.clip = clip
+
+        self.symbols = []
+        self.count = []
+        self.indices = {}
+        for i in range(n_units):  # unit symbols "0" .. str(n_units - 1) are added first
+            self.add_symbol(str(i))
+
+        self.bos_index = self.add_symbol(bos)
+        self.pad_index = self.add_symbol(pad)
+        self.eos_index = self.add_symbol(eos)
+        self.unk_index = self.add_symbol(unk)
+
+        if extra_special_symbols:
+            for s in extra_special_symbols:
+                self.add_symbol(s)
+        self.nspecial = len(self.symbols)  # counts every symbol added so far, unit symbols included
+
+    def encode_line(self, line, append_eos=True, prepend_bos=False) -> torch.IntTensor:
+        words = [int(x) for x in line.split()]  # whitespace-separated integers are used as ids directly
+        if self.clip:
+            words = [min(self.n_units - 1, word) for word in words]  # upper cap only; no lower bound is applied
+        if prepend_bos:
+            words = [self.bos_index] + words
+        if append_eos:
+            words.append(self.eos_index)
+        ids = torch.IntTensor(words)
+        return ids
+
+
+@dataclass
+class SpeechUnitModelingConfig(FairseqDataclass):  # options for the "speech_unit_modeling" task
+    data: str = field(default=MISSING, metadata={"help": "Path to data config.json"})
+    max_token_duration: int = field(
+        default=20, metadata={"help": "all token durations are capped to this value"}
+    )
+    tokens_per_sample: int = field(
+        default=1024, metadata={"help": "tokens in a sample"}
+    )
+    max_target_positions: int = field(
+        default=1024, metadata={"help": "max target positions"}
+    )
+
+    # duration modeling
+    ignore_duration_input: bool = field(
+        default=False, metadata={"help": "whether token durations should be zeroed out"}
+    )
+    discrete_duration: bool = field(
+        default=False, metadata={"help": "treat duration as discrete variable"}
+    )
+    # F0 modeling
+    ignore_f0_input: bool = field(
+        default=False, metadata={"help": "whether F0 should be zeroed out"}
+    )
+    discrete_f0: bool = field(
+        default=False, metadata={"help": "load quantized f0. get bin from config"}
+    )
+    log_f0: bool = field(
+        default=False, metadata={"help": "whether f0 should be modeled in log space"}
+    )
+    normalize_f0_mean: bool = field(
+        default=False, metadata={"help": "whether normalize f0 by speaker mean"}
+    )
+    normalize_f0_std: bool = field(  # the task asserts this is only set together with normalize_f0_mean
+        default=False, metadata={"help": "whether normalize f0 by speaker stddev"}
+    )
+    interpolate_f0: bool = field(
+        default=False,
+        metadata={"help": "whether interpolate f0 for non-voiced segments"},
+    )
+
+    # input/output streams
+    stream_shifts: str = field(  # kept as a string; the task passes it to CodeDataset as `shifts`
+        default="0,0",
+        metadata={
+            "help": (
+                "comma-separated integer list denoting right-shift for "
+                "duration and pitch streams"
+            )
+        },
+    )
+
+
+@register_task("speech_unit_modeling", dataclass=SpeechUnitModelingConfig)
+class SpeechUnitLanguageModelingTask(FairseqTask):  # task over three channels: token, duration and f0
+    def __init__(self, cfg: SpeechUnitModelingConfig) -> None:
+        super().__init__(cfg)
+        assert not self.cfg.normalize_f0_std or self.cfg.normalize_f0_mean  # std normalisation requires mean normalisation
+        # Source and target share one dictionary object per channel (chained assignments below).
+        self.data_config = ExpressiveCodeDataConfig(cfg.data)
+        self._source_dictionary = self._target_dictionary = UnitDictionary(
+            n_units=self.data_config.n_units
+        )
+        self._source_duration_dictionary = self._target_duration_dictionary = (
+            UnitDictionary(n_units=self.cfg.max_token_duration + 1, clip=True)
+            if self.cfg.discrete_duration
+            else None
+        )
+        self._source_f0_dictionary = self._target_f0_dictionary = (
+            UnitDictionary(n_units=self.data_config.f0_vq_n_units)
+            if self.cfg.discrete_f0
+            else None
+        )
+        # A channel without a discrete dictionary gets size 1.
+        self._channel_names = ["token", "duration", "f0"]
+        self._channel_sizes = [
+            len(self.target_dictionary),
+            len(self.target_duration_dictionary) if self.cfg.discrete_duration else 1,
+            len(self.target_f0_dictionary) if self.cfg.discrete_f0 else 1,
+        ]
+
+    @property
+    def source_dictionary(self) -> Optional[Dictionary]:
+        return self._source_dictionary
+
+    @property
+    def source_duration_dictionary(self) -> Optional[Dictionary]:  # None unless cfg.discrete_duration
+        return self._source_duration_dictionary
+
+    @property
+    def source_f0_dictionary(self) -> Optional[Dictionary]:  # None unless cfg.discrete_f0
+        return self._source_f0_dictionary
+
+    @property
+    def channel_names(self) -> List[str]:
+        return self._channel_names
+
+    @property
+    def channel_sizes(self) -> List[int]:  # same order as channel_names
+        return self._channel_sizes
+
+    @property
+    def dictionary(self) -> Optional[Dictionary]:  # alias for the source (token) dictionary
+        return self._source_dictionary
+
+    @property
+    def target_dictionary(self) -> Optional[Dictionary]:
+        return self._target_dictionary
+
+    @property
+    def target_duration_dictionary(self) -> Optional[Dictionary]:  # None unless cfg.discrete_duration
+        return self._target_duration_dictionary
+
+    @property
+    def target_f0_dictionary(self) -> Optional[Dictionary]:  # None unless cfg.discrete_f0
+        return self._target_f0_dictionary
+
+    @property
+    def dictionaries(self) -> List[Dictionary]:  # NOTE(review): `_dictionaries` and `cfg.labels` are not set in this class or its config as visible here - verify this is usable
+        return [self._dictionaries[l] for l in self.cfg.labels]
+
+    @classmethod
+    def setup_task(
+        cls, cfg: SpeechUnitModelingConfig, **kwargs
+    ) -> "SpeechUnitLanguageModelingTask":  # `kwargs` is unused
+        return cls(cfg)
+
+    def load_dataset(self, split: str, **kwargs) -> None:  # builds datasets[split] from data_config.manifests[split]
+        self.datasets[split] = CodeDataset(
+            manifest=self.data_config.manifests[split],
+            dictionary=self.source_dictionary,
+            dur_dictionary=self.source_duration_dictionary,
+            f0_dictionary=self.source_f0_dictionary,
+            config=self.data_config,
+            discrete_dur=self.cfg.discrete_duration,
+            discrete_f0=self.cfg.discrete_f0,
+            log_f0=self.cfg.log_f0,
+            normalize_f0_mean=self.cfg.normalize_f0_mean,
+            normalize_f0_std=self.cfg.normalize_f0_std,
+            interpolate_f0=self.cfg.interpolate_f0,
+            shifts=self.cfg.stream_shifts,
+        )
+
+    def max_positions(self) -> Tuple[int, int]:  # sys.maxsize for both entries
+        return (sys.maxsize, sys.maxsize)
+
+    def build_criterion(self, cfg: DictConfig):
+        import fairseq.criterions
+
+        return fairseq.criterions.build_criterion(cfg, self)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
new file mode 100644
index 0000000000..82e7e6643a
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
@@ -0,0 +1,501 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import os.path as op
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from fairseq.data.audio.text_to_speech_dataset import TextToSpeechDatasetCreator
+from fairseq.tasks import register_task
+from fairseq.tasks.speech_to_text import SpeechToTextTask
+from fairseq.speech_generator import (
+    AutoRegressiveSpeechGenerator,
+    NonAutoregressiveSpeechGenerator,
+    TeacherForcingAutoRegressiveSpeechGenerator,
+)
+
+logging.basicConfig(  # NOTE(review): configures the root logger as an import-time side effect of this module
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+try:
+    from tensorboardX import SummaryWriter  # optional dependency, only needed for the extra validation logging
+except ImportError:
+    logger.info("Please install tensorboardX: pip install tensorboardX")
+    SummaryWriter = None  # sentinel checked in TextToSpeechTask.__init__ before a TensorBoard directory is set
+
+
+@register_task("text_to_speech")
+class TextToSpeechTask(SpeechToTextTask):  # SpeechToTextTask plus optional generation-based validation metrics and TensorBoard dumps
+    @staticmethod
+    def add_args(parser):  # declares this task's command-line options on `parser`
+        parser.add_argument("data", help="manifest root path")
+        parser.add_argument(
+            "--config-yaml",
+            type=str,
+            default="config.yaml",
+            help="Configuration YAML filename (under manifest root)",
+        )
+        parser.add_argument(
+            "--max-source-positions",
+            default=1024,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the source sequence",
+        )
+        parser.add_argument(
+            "--max-target-positions",
+            default=1200,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the target sequence",
+        )
+        parser.add_argument("--n-frames-per-step", type=int, default=1)
+        parser.add_argument("--eos-prob-threshold", type=float, default=0.5)  # forwarded to the autoregressive generators
+        parser.add_argument("--eval-inference", action="store_true")  # enables the generation-based metrics in valid_step
+        parser.add_argument("--eval-tb-nsample", type=int, default=8)  # max hypotheses per logged batch sent to TensorBoard
+        parser.add_argument("--vocoder", type=str, default="griffin_lim")
+        parser.add_argument("--spec-bwd-max-iter", type=int, default=8)
+
+    def __init__(self, args, src_dict):
+        super().__init__(args, src_dict)
+        self.src_dict = src_dict
+        self.sr = self.data_cfg.config.get("features").get("sample_rate")  # assumes the config has a "features" mapping; a None there would fail on the second .get
+
+        self.tensorboard_writer = None  # created on first use in log_tensorboard
+        self.tensorboard_dir = ""  # stays empty (extra logging off) unless a logdir is set and tensorboardX imported
+        if args.tensorboard_logdir and SummaryWriter is not None:
+            self.tensorboard_dir = os.path.join(args.tensorboard_logdir, "valid_extra")
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):  # `combine` and **kwargs are accepted but unused here
+        is_train_split = split.startswith("train")  # any split whose name starts with "train" is treated as training data
+        pre_tokenizer = self.build_tokenizer(self.args)
+        bpe_tokenizer = self.build_bpe(self.args)
+        self.datasets[split] = TextToSpeechDatasetCreator.from_tsv(
+            self.args.data,
+            self.data_cfg,
+            split,
+            self.src_dict,
+            pre_tokenizer,
+            bpe_tokenizer,
+            is_train_split=is_train_split,
+            epoch=epoch,
+            seed=self.args.seed,
+            n_frames_per_step=self.args.n_frames_per_step,
+            speaker_to_id=self.speaker_to_id,
+        )
+
+    @property
+    def target_dictionary(self):
+        return None  # this task exposes no target dictionary
+
+    @property
+    def source_dictionary(self):
+        return self.src_dict
+
+    def get_speaker_embeddings_path(self):
+        speaker_emb_path = None  # returned as-is when the data config names no embedding file
+        if self.data_cfg.config.get("speaker_emb_filename") is not None:
+            speaker_emb_path = op.join(  # the filename is resolved relative to the manifest root (args.data)
+                self.args.data, self.data_cfg.config.get("speaker_emb_filename")
+            )
+        return speaker_emb_path
+
+    @classmethod
+    def get_speaker_embeddings(cls, args):
+        embed_speaker = None  # stays None when args.speaker_to_id is None
+        if args.speaker_to_id is not None:
+            if args.speaker_emb_path is None:
+                embed_speaker = torch.nn.Embedding(  # no pretrained file: new table with one row per speaker
+                    len(args.speaker_to_id), args.speaker_embed_dim
+                )
+            else:
+                speaker_emb_mat = np.load(args.speaker_emb_path)
+                assert speaker_emb_mat.shape[1] == args.speaker_embed_dim  # file width must match the configured dimension
+                embed_speaker = torch.nn.Embedding.from_pretrained(
+                    torch.from_numpy(speaker_emb_mat),
+                    freeze=True,
+                )
+                logger.info(  # note: the full embedding matrix is written to the log
+                    f"load speaker embeddings from {args.speaker_emb_path}. "
+                    f"train embedding? {embed_speaker.weight.requires_grad}\n"
+                    f"embeddings:\n{speaker_emb_mat}"
+                )
+        return embed_speaker
+
+    def build_model(self, cfg, from_checkpoint=False):
+        cfg.pitch_min = self.data_cfg.config["features"].get("pitch_min", None)  # the next five lines copy data-config values onto the model cfg
+        cfg.pitch_max = self.data_cfg.config["features"].get("pitch_max", None)
+        cfg.energy_min = self.data_cfg.config["features"].get("energy_min", None)
+        cfg.energy_max = self.data_cfg.config["features"].get("energy_max", None)
+        cfg.speaker_emb_path = self.get_speaker_embeddings_path()
+        model = super().build_model(cfg, from_checkpoint)
+        self.generator = None  # only replaced when cfg.eval_inference is truthy
+        if getattr(cfg, "eval_inference", False):
+            self.generator = self.build_generator([model], cfg)
+        return model
+
+    def build_generator(self, models, cfg, vocoder=None, **unused):
+        if vocoder is None:
+            vocoder = self.build_default_vocoder()
+        model = models[0]  # only the first model is used; any others are ignored
+        if getattr(model, "NON_AUTOREGRESSIVE", False):
+            return NonAutoregressiveSpeechGenerator(model, vocoder, self.data_cfg)
+        else:
+            generator = AutoRegressiveSpeechGenerator
+            if getattr(cfg, "teacher_forcing", False):
+                generator = TeacherForcingAutoRegressiveSpeechGenerator
+                logger.info("Teacher forcing mode for generation")
+            return generator(
+                model,
+                vocoder,
+                self.data_cfg,
+                max_iter=self.args.max_target_positions,  # decoding step cap comes from the task args, not from cfg
+                eos_prob_threshold=self.args.eos_prob_threshold,
+            )
+
+    def build_default_vocoder(self):
+        from fairseq.models.text_to_speech.vocoder import get_vocoder
+
+        vocoder = get_vocoder(self.args, self.data_cfg)
+        if torch.cuda.is_available() and not self.args.cpu:  # CUDA unless it is unavailable or args.cpu is set
+            vocoder = vocoder.cuda()
+        else:
+            vocoder = vocoder.cpu()
+        return vocoder
+
+    def valid_step(self, sample, model, criterion):
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+
+        if getattr(self.args, "eval_inference", False):
+            hypos, inference_losses = self.valid_step_with_inference(
+                sample, model, self.generator
+            )
+            for k, v in inference_losses.items():
+                assert k not in logging_output  # never overwrite a key the parent valid_step already logged
+                logging_output[k] = v
+
+            picked_id = 0
+            if self.tensorboard_dir and (sample["id"] == picked_id).any():  # log only the batch that contains sample id 0
+                self.log_tensorboard(
+                    sample,
+                    hypos[: self.args.eval_tb_nsample],
+                    model._num_updates,
+                    is_na_model=getattr(model, "NON_AUTOREGRESSIVE", False),
+                )
+        return loss, sample_size, logging_output
+
+    def valid_step_with_inference(self, sample, model, generator):
+        hypos = generator.generate(model, sample, has_targ=True)
+
+        losses = {  # per-batch sums; nothing is averaged here
+            "mcd_loss": 0.0,
+            "targ_frames": 0.0,
+            "pred_frames": 0.0,
+            "nins": 0.0,
+            "ndel": 0.0,
+        }
+        rets = batch_mel_cepstral_distortion(
+            [hypo["targ_waveform"] for hypo in hypos],
+            [hypo["waveform"] for hypo in hypos],
+            self.sr,
+            normalize_type=None,  # divisor 1: the raw accumulated DTW cost is summed
+        )
+        for d, extra in rets:
+            pathmap = extra[-1]  # last element of the extras tuple is the (target frames, predicted frames) alignment map
+            losses["mcd_loss"] += d.item()
+            losses["targ_frames"] += pathmap.size(0)
+            losses["pred_frames"] += pathmap.size(1)
+            losses["nins"] += (pathmap.sum(dim=1) - 1).sum().item()  # per target frame: aligned predicted frames beyond the first
+            losses["ndel"] += (pathmap.sum(dim=0) - 1).sum().item()  # per predicted frame: aligned target frames beyond the first
+
+        return hypos, losses
+
+    def log_tensorboard(self, sample, hypos, num_updates, is_na_model=False):
+        if self.tensorboard_writer is None:
+            self.tensorboard_writer = SummaryWriter(self.tensorboard_dir)  # writer is created once and then reused
+        tb_writer = self.tensorboard_writer
+        for b in range(len(hypos)):
+            idx = sample["id"][b]
+            text = sample["src_texts"][b]
+            targ = hypos[b]["targ_feature"]
+            pred = hypos[b]["feature"]
+            attn = hypos[b]["attn"]
+
+            if is_na_model:  # non-autoregressive: attn goes to the 1-D panel and no eos curve is drawn
+                data = plot_tts_output(
+                    [targ.transpose(0, 1), pred.transpose(0, 1)],
+                    [f"target (idx={idx})", "output"],
+                    attn,
+                    "alignment",
+                    ret_np=True,
+                    suptitle=text,
+                )
+            else:  # autoregressive: attn is a third heatmap and eos_prob is the 1-D panel
+                eos_prob = hypos[b]["eos_prob"]
+                data = plot_tts_output(
+                    [targ.transpose(0, 1), pred.transpose(0, 1), attn],
+                    [f"target (idx={idx})", "output", "alignment"],
+                    eos_prob,
+                    "eos prob",
+                    ret_np=True,
+                    suptitle=text,
+                )
+
+            tb_writer.add_image(
+                f"inference_sample_{b}", data, num_updates, dataformats="HWC"  # matches the (H, W, 3) array from save_figure_to_numpy
+            )
+
+            if hypos[b]["waveform"] is not None:  # audio is logged only when a waveform was produced
+                targ_wave = hypos[b]["targ_waveform"].detach().cpu().float()
+                pred_wave = hypos[b]["waveform"].detach().cpu().float()
+                tb_writer.add_audio(
+                    f"inference_targ_{b}", targ_wave, num_updates, sample_rate=self.sr
+                )
+                tb_writer.add_audio(
+                    f"inference_pred_{b}", pred_wave, num_updates, sample_rate=self.sr
+                )
+
+
+def save_figure_to_numpy(fig):
+    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="")
+    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+    return data
+
+
+DEFAULT_V_MIN = np.log(1e-5)  # default lower bound of the colour scale in plot_tts_output
+
+
+def plot_tts_output(
+    data_2d,
+    title_2d,
+    data_1d,
+    title_1d,
+    figsize=(24, 4),
+    v_min=DEFAULT_V_MIN,
+    v_max=3,
+    ret_np=False,
+    suptitle="",
+):
+    try:
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+    except ImportError:
+        raise ImportError("Please install Matplotlib: pip install matplotlib")
+
+    data_2d = [
+        x.detach().cpu().float().numpy() if isinstance(x, torch.Tensor) else x
+        for x in data_2d
+    ]
+    fig, axes = plt.subplots(1, len(data_2d) + 1, figsize=figsize)
+    if suptitle:
+        fig.suptitle(suptitle[:400])  # capped at 400 chars
+    axes = [axes] if len(data_2d) == 0 else axes
+    for ax, x, name in zip(axes, data_2d, title_2d):
+        ax.set_title(name)
+        divider = make_axes_locatable(ax)
+        cax = divider.append_axes("right", size="5%", pad=0.05)
+        im = ax.imshow(
+            x,
+            origin="lower",
+            aspect="auto",
+            vmin=max(x.min(), v_min),
+            vmax=min(x.max(), v_max),
+        )
+        fig.colorbar(im, cax=cax, orientation="vertical")
+
+    if isinstance(data_1d, torch.Tensor):
+        data_1d = data_1d.detach().cpu().numpy()
+    axes[-1].plot(data_1d)
+    axes[-1].set_title(title_1d)
+    plt.tight_layout()
+
+    if ret_np:
+        fig.canvas.draw()
+        data = save_figure_to_numpy(fig)
+        plt.close(fig)
+        return data
+
+
+def antidiag_indices(offset, min_i=0, max_i=None, min_j=0, max_j=None):
+    """
+    for a (3, 4) matrix with min_i=1, max_i=3, min_j=1, max_j=4, outputs
+
+    offset=2 (1, 1),
+    offset=3 (2, 1), (1, 2)
+    offset=4 (2, 2), (1, 3)
+    offset=5 (2, 3)
+
+    constraints:
+        i + j = offset
+        min_j <= j < max_j
+        min_i <= offset - j < max_i
+    """
+    if max_i is None:
+        max_i = offset + 1
+    if max_j is None:
+        max_j = offset + 1
+    min_j = max(min_j, offset - max_i + 1, 0)
+    max_j = min(max_j, offset - min_i + 1, offset + 1)
+    j = torch.arange(min_j, max_j)
+    i = offset - j
+    return torch.stack([i, j])
+
+
+def batch_dynamic_time_warping(distance, shapes=None):
+    """full batched DTW without any constraints
+
+    distance:  (batchsize, max_M, max_N) matrix
+    shapes: (batchsize,) vector specifying (M, N) for each entry
+    """
+    # ptr: 0=left, 1=up-left, 2=up
+    ptr2dij = {0: (0, -1), 1: (-1, -1), 2: (-1, 0)}
+
+    bsz, m, n = distance.size()
+    cumdist = torch.zeros_like(distance)
+    backptr = torch.zeros_like(distance).type(torch.int32) - 1  # -1 = not yet assigned
+
+    # initialize
+    cumdist[:, 0, :] = distance[:, 0, :].cumsum(dim=-1)  # row 0 can only be entered from the left
+    cumdist[:, :, 0] = distance[:, :, 0].cumsum(dim=-1)  # column 0 can only be entered from above
+    backptr[:, 0, :] = 0
+    backptr[:, :, 0] = 2  # also overwrites cell (0, 0); the backtrace stops there before reading it
+
+    # DP with optimized anti-diagonal parallelization, O(M+N) steps
+    for offset in range(2, m + n - 1):  # interior cells satisfy 2 <= i + j <= m + n - 2
+        ind = antidiag_indices(offset, 1, m, 1, n)  # row 0 = i indices, row 1 = j indices
+        c = torch.stack(
+            [
+                cumdist[:, ind[0], ind[1] - 1],
+                cumdist[:, ind[0] - 1, ind[1] - 1],
+                cumdist[:, ind[0] - 1, ind[1]],
+            ],
+            dim=2,
+        )  # stacking order matches the ptr2dij keys: left, up-left, up
+        v, b = c.min(axis=-1)  # v = cheapest predecessor cost, b = which predecessor (0/1/2)
+        backptr[:, ind[0], ind[1]] = b.int()
+        cumdist[:, ind[0], ind[1]] = v + distance[:, ind[0], ind[1]]
+
+    # backtrace
+    pathmap = torch.zeros_like(backptr)
+    for b in range(bsz):  # `b` is reused here as the batch index
+        i = m - 1 if shapes is None else (shapes[b][0] - 1).item()  # shapes[b] is indexed as (M, N), i.e. shapes is used as (batchsize, 2)
+        j = n - 1 if shapes is None else (shapes[b][1] - 1).item()
+        dtwpath = [(i, j)]
+        while (i != 0 or j != 0) and len(dtwpath) < 10000:  # NOTE(review): a path longer than 10000 cells is silently cut short
+            assert i >= 0 and j >= 0
+            di, dj = ptr2dij[backptr[b, i, j].item()]
+            i, j = i + di, j + dj
+            dtwpath.append((i, j))
+        dtwpath = dtwpath[::-1]  # reorder from the start of the path to its end
+        indices = torch.from_numpy(np.array(dtwpath))
+        pathmap[b, indices[:, 0], indices[:, 1]] = 1  # mark every visited cell
+
+    return cumdist, backptr, pathmap
+
+
+def compute_l2_dist(x1, x2):
+    """compute an (m, n) L2 distance matrix from (m, d) and (n, d) matrices"""
+    return torch.cdist(x1.unsqueeze(0), x2.unsqueeze(0), p=2).squeeze(0).pow(2)
+
+
+def compute_rms_dist(x1, x2):
+    l2_dist = compute_l2_dist(x1, x2)
+    return (l2_dist / x1.size(1)).pow(0.5)
+
+
+def get_divisor(pathmap, normalize_type):
+    if normalize_type is None:
+        return 1
+    elif normalize_type == "len1":
+        return pathmap.size(0)
+    elif normalize_type == "len2":
+        return pathmap.size(1)
+    elif normalize_type == "path":
+        return pathmap.sum().item()
+    else:
+        raise ValueError(f"normalize_type {normalize_type} not supported")
+
+
+def batch_compute_distortion(y1, y2, sr, feat_fn, dist_fn, normalize_type):
+    d, s, x1, x2 = [], [], [], []
+    for cur_y1, cur_y2 in zip(y1, y2):
+        assert cur_y1.ndim == 1 and cur_y2.ndim == 1
+        cur_x1 = feat_fn(cur_y1)
+        cur_x2 = feat_fn(cur_y2)
+        x1.append(cur_x1)
+        x2.append(cur_x2)
+
+        cur_d = dist_fn(cur_x1, cur_x2)
+        d.append(cur_d)
+        s.append(d[-1].size())
+    max_m = max(ss[0] for ss in s)
+    max_n = max(ss[1] for ss in s)
+    d = torch.stack(
+        [F.pad(dd, (0, max_n - dd.size(1), 0, max_m - dd.size(0))) for dd in d]
+    )
+    s = torch.LongTensor(s).to(d.device)
+    cumdists, backptrs, pathmaps = batch_dynamic_time_warping(d, s)
+
+    rets = []
+    itr = zip(s, x1, x2, d, cumdists, backptrs, pathmaps)
+    for (m, n), cur_x1, cur_x2, dist, cumdist, backptr, pathmap in itr:
+        cumdist = cumdist[:m, :n]
+        backptr = backptr[:m, :n]
+        pathmap = pathmap[:m, :n]
+        divisor = get_divisor(pathmap, normalize_type)
+
+        distortion = cumdist[-1, -1] / divisor
+        ret = distortion, (cur_x1, cur_x2, dist, cumdist, backptr, pathmap)
+        rets.append(ret)
+    return rets
+
+
+def batch_mel_cepstral_distortion(y1, y2, sr, normalize_type="path", mfcc_fn=None):
+    """
+    https://arxiv.org/pdf/2011.03568.pdf
+
+    The root mean squared error computed on 13-dimensional MFCC using DTW for
+    alignment. MFCC features are computed from an 80-channel log-mel
+    spectrogram using a 50ms Hann window and hop of 12.5ms.
+
+    y1: list of waveforms
+    y2: list of waveforms
+    sr: sampling rate
+    """
+    # returns the list built by batch_compute_distortion: one (distortion, extras) tuple per waveform pair
+    try:
+        import torchaudio
+    except ImportError:
+        raise ImportError("Please install torchaudio: pip install torchaudio")
+
+    if mfcc_fn is None or mfcc_fn.sample_rate != sr:  # (re)build the transform when none is given or its rate differs
+        melkwargs = {
+            "n_fft": int(0.05 * sr),  # 50 ms, in samples
+            "win_length": int(0.05 * sr),
+            "hop_length": int(0.0125 * sr),  # 12.5 ms, in samples
+            "f_min": 20,
+            "n_mels": 80,
+            "window_fn": torch.hann_window,
+        }
+        mfcc_fn = torchaudio.transforms.MFCC(
+            sr, n_mfcc=13, log_mels=True, melkwargs=melkwargs
+        ).to(y1[0].device)  # the transform is placed on the device of the first waveform
+    return batch_compute_distortion(
+        y1,
+        y2,
+        sr,  # forwarded, but batch_compute_distortion does not read it
+        lambda y: mfcc_fn(y).transpose(-1, -2),  # swap the last two axes; assumes MFCC yields (n_mfcc, frames) — TODO confirm
+        compute_rms_dist,
+        normalize_type,
+    )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
new file mode 100644
index 0000000000..6897ebe116
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
@@ -0,0 +1,498 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+import itertools
+import json
+import logging
+import os
+from typing import Optional
+from argparse import Namespace
+from omegaconf import II
+
+import numpy as np
+from fairseq import utils
+from fairseq.logging import metrics
+from fairseq.data import (
+    AppendTokenDataset,
+    ConcatDataset,
+    LanguagePairDataset,
+    PrependTokenDataset,
+    StripTokenDataset,
+    TruncateDataset,
+    data_utils,
+    encoders,
+    indexed_dataset,
+)
+from fairseq.data.indexed_dataset import get_available_dataset_impl
+from fairseq.dataclass import ChoiceEnum, FairseqDataclass
+from fairseq.tasks import FairseqTask, register_task
+
+
+EVAL_BLEU_ORDER = 4  # number of BLEU count/total entries logged per validation step (see TranslationTask.valid_step)
+
+
+logger = logging.getLogger(__name__)
+
+
+def load_langpair_dataset(
+    data_path,
+    split,
+    src,
+    src_dict,
+    tgt,
+    tgt_dict,
+    combine,
+    dataset_impl,
+    upsample_primary,
+    left_pad_source,
+    left_pad_target,
+    max_source_positions,
+    max_target_positions,  # accepted but never read in this function
+    prepend_bos=False,
+    load_alignments=False,
+    truncate_source=False,
+    append_source_id=False,
+    num_buckets=0,
+    shuffle=True,
+    pad_to_multiple=1,
+    prepend_bos_src=None,
+):
+    def split_exists(split, src, tgt, lang, data_path):  # does "<split>.<src>-<tgt>.<lang>" exist under data_path?
+        filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang))
+        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)
+
+    src_datasets = []
+    tgt_datasets = []
+
+    for k in itertools.count():  # shards are named split, split1, split2, ...
+        split_k = split + (str(k) if k > 0 else "")
+
+        # infer langcode
+        if split_exists(split_k, src, tgt, src, data_path):
+            prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt))
+        elif split_exists(split_k, tgt, src, src, data_path):  # files may be stored under the reversed pair name
+            prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src))
+        else:
+            if k > 0:
+                break  # ran out of extra shards
+            else:
+                raise FileNotFoundError(
+                    "Dataset not found: {} ({})".format(split, data_path)
+                )
+
+        src_dataset = data_utils.load_indexed_dataset(
+            prefix + src, src_dict, dataset_impl
+        )
+        if truncate_source:
+            src_dataset = AppendTokenDataset(
+                TruncateDataset(
+                    StripTokenDataset(src_dataset, src_dict.eos()),
+                    max_source_positions - 1,  # one position is kept for the EOS appended again below
+                ),
+                src_dict.eos(),
+            )
+        src_datasets.append(src_dataset)
+
+        tgt_dataset = data_utils.load_indexed_dataset(
+            prefix + tgt, tgt_dict, dataset_impl
+        )
+        if tgt_dataset is not None:  # the target side is optional
+            tgt_datasets.append(tgt_dataset)
+
+        logger.info(
+            "{} {} {}-{} {} examples".format(
+                data_path, split_k, src, tgt, len(src_datasets[-1])
+            )
+        )
+
+        if not combine:
+            break  # only the first shard is loaded unless combining
+
+    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0  # targets for every shard, or for none
+
+    if len(src_datasets) == 1:
+        src_dataset = src_datasets[0]
+        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
+    else:
+        sample_ratios = [1] * len(src_datasets)
+        sample_ratios[0] = upsample_primary  # only the first shard gets the upsampling ratio
+        src_dataset = ConcatDataset(src_datasets, sample_ratios)
+        if len(tgt_datasets) > 0:
+            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
+        else:
+            tgt_dataset = None
+
+    if prepend_bos:
+        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
+        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
+        if tgt_dataset is not None:
+            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
+    elif prepend_bos_src is not None:  # ignored when prepend_bos is set
+        logger.info(f"prepending src bos: {prepend_bos_src}")
+        src_dataset = PrependTokenDataset(src_dataset, prepend_bos_src)
+
+    eos = None
+    if append_source_id:
+        src_dataset = AppendTokenDataset(
+            src_dataset, src_dict.index("[{}]".format(src))
+        )
+        if tgt_dataset is not None:
+            tgt_dataset = AppendTokenDataset(
+                tgt_dataset, tgt_dict.index("[{}]".format(tgt))
+            )
+        eos = tgt_dict.index("[{}]".format(tgt))  # the target language tag is passed to LanguagePairDataset as eos
+
+    align_dataset = None
+    if load_alignments:
+        align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt))  # base split name, not split_k
+        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):  # a missing alignment file is skipped without error
+            align_dataset = data_utils.load_indexed_dataset(
+                align_path, None, dataset_impl
+            )
+
+    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
+    return LanguagePairDataset(
+        src_dataset,
+        src_dataset.sizes,
+        src_dict,
+        tgt_dataset,
+        tgt_dataset_sizes,
+        tgt_dict,
+        left_pad_source=left_pad_source,
+        left_pad_target=left_pad_target,
+        align_dataset=align_dataset,
+        eos=eos,
+        num_buckets=num_buckets,
+        shuffle=shuffle,
+        pad_to_multiple=pad_to_multiple,
+    )
+
+
+@dataclass
+class TranslationConfig(FairseqDataclass):  # configuration fields read by TranslationTask via self.cfg
+    data: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "colon separated path to data directories list, will be iterated upon during epochs "
+            "in round-robin manner; however, valid and test data are always in the first directory "
+            "to avoid the need for repeating them in all directories"
+        },
+    )
+    source_lang: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "source language",
+            "argparse_alias": "-s",
+        },
+    )
+    target_lang: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "target language",
+            "argparse_alias": "-t",
+        },
+    )
+    load_alignments: bool = field(
+        default=False, metadata={"help": "load the binarized alignments"}
+    )
+    left_pad_source: bool = field(
+        default=True, metadata={"help": "pad the source on the left"}
+    )
+    left_pad_target: bool = field(
+        default=False, metadata={"help": "pad the target on the left"}
+    )
+    max_source_positions: int = field(
+        default=1024, metadata={"help": "max number of tokens in the source sequence"}
+    )
+    max_target_positions: int = field(
+        default=1024, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    upsample_primary: int = field(
+        default=-1, metadata={"help": "the amount of upsample primary dataset"}
+    )
+    truncate_source: bool = field(
+        default=False, metadata={"help": "truncate source to max-source-positions"}
+    )
+    num_batch_buckets: int = field(
+        default=0,
+        metadata={
+            "help": "if >0, then bucket source and target lengths into "
+            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
+        },
+    )
+    train_subset: str = II("dataset.train_subset")  # this and the next two fields are interpolated from the dataset config group
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")
+
+    # options for reporting BLEU during validation
+    eval_bleu: bool = field(
+        default=False, metadata={"help": "evaluation with BLEU scores"}
+    )
+    eval_bleu_args: Optional[str] = field(
+        default="{}",  # parsed with json.loads in TranslationTask.build_model
+        metadata={
+            "help": 'generation args for BLUE scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'  # NOTE(review): "BLUE" is a typo for "BLEU" in this user-visible help text
+        },
+    )
+    eval_bleu_detok: str = field(
+        default="space",
+        metadata={
+            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
+            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
+        },
+    )
+    eval_bleu_detok_args: Optional[str] = field(
+        default="{}",  # parsed with json.loads in TranslationTask.build_model
+        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
+    )
+    eval_tokenized_bleu: bool = field(
+        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
+    )
+    eval_bleu_remove_bpe: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "remove BPE before computing BLEU",
+            "argparse_const": "@@ ",  # presumably the value used when the flag is given without an argument — confirm
+        },
+    )
+    eval_bleu_print_samples: bool = field(
+        default=False, metadata={"help": "print sample generations during validation"}
+    )
+
+
+@register_task("translation", dataclass=TranslationConfig)
+class TranslationTask(FairseqTask):
+    """
+    Translate from one (source) language to another (target) language.
+
+    Args:
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+    """
+
+    cfg: TranslationConfig
+
+    def __init__(self, cfg: TranslationConfig, src_dict, tgt_dict):
+        super().__init__(cfg)
+        self.src_dict = src_dict  # source-side dictionary, later passed to load_langpair_dataset
+        self.tgt_dict = tgt_dict  # target-side dictionary, later passed to load_langpair_dataset
+
+    @classmethod
+    def setup_task(cls, cfg: TranslationConfig, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            cfg (TranslationConfig): task config; languages are inferred from the first data path if unset
+        """
+        # **kwargs is accepted but not used
+        paths = utils.split_paths(cfg.data)
+        assert len(paths) > 0
+        # find language pair automatically
+        if cfg.source_lang is None or cfg.target_lang is None:
+            cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(paths[0])  # note: writes the result back into cfg
+        if cfg.source_lang is None or cfg.target_lang is None:
+            raise Exception(
+                "Could not infer language pair, please provide it explicitly"
+            )
+
+        # load dictionaries
+        src_dict = cls.load_dictionary(
+            os.path.join(paths[0], "dict.{}.txt".format(cfg.source_lang))
+        )
+        tgt_dict = cls.load_dictionary(
+            os.path.join(paths[0], "dict.{}.txt".format(cfg.target_lang))
+        )
+        assert src_dict.pad() == tgt_dict.pad()  # both dictionaries must agree on the pad, eos and unk indices
+        assert src_dict.eos() == tgt_dict.eos()
+        assert src_dict.unk() == tgt_dict.unk()
+        logger.info("[{}] dictionary: {} types".format(cfg.source_lang, len(src_dict)))
+        logger.info("[{}] dictionary: {} types".format(cfg.target_lang, len(tgt_dict)))
+
+        return cls(cfg, src_dict, tgt_dict)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        paths = utils.split_paths(self.cfg.data)
+        assert len(paths) > 0
+        if split != self.cfg.train_subset:
+            # if not training data set, use the first shard for valid and test
+            paths = paths[:1]
+        data_path = paths[(epoch - 1) % len(paths)]  # round-robin over the data directories, one per epoch
+
+        # infer langcode
+        src, tgt = self.cfg.source_lang, self.cfg.target_lang
+
+        self.datasets[split] = load_langpair_dataset(  # result is stored under self.datasets[split]; nothing is returned
+            data_path,
+            split,
+            src,
+            self.src_dict,
+            tgt,
+            self.tgt_dict,
+            combine=combine,
+            dataset_impl=self.cfg.dataset_impl,
+            upsample_primary=self.cfg.upsample_primary,
+            left_pad_source=self.cfg.left_pad_source,
+            left_pad_target=self.cfg.left_pad_target,
+            max_source_positions=self.cfg.max_source_positions,
+            max_target_positions=self.cfg.max_target_positions,
+            load_alignments=self.cfg.load_alignments,
+            truncate_source=self.cfg.truncate_source,
+            num_buckets=self.cfg.num_batch_buckets,
+            shuffle=(split != "test"),  # every split except the one literally named "test" is shuffled
+            pad_to_multiple=self.cfg.required_seq_len_multiple,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        return LanguagePairDataset(  # source-only dataset: no target tokens are passed
+            src_tokens,
+            src_lengths,
+            self.source_dictionary,
+            tgt_dict=self.target_dictionary,
+            constraints=constraints,  # forwarded unchanged
+        )
+
+    def build_model(self, cfg, from_checkpoint=False):
+        model = super().build_model(cfg, from_checkpoint)
+        if self.cfg.eval_bleu:  # BLEU validation needs a detokenizer and a generator; both are built once here
+            detok_args = json.loads(self.cfg.eval_bleu_detok_args)
+            self.tokenizer = encoders.build_tokenizer(
+                Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args)
+            )
+
+            gen_args = json.loads(self.cfg.eval_bleu_args)
+            self.sequence_generator = self.build_generator(  # used by valid_step when eval_bleu is set
+                [model], Namespace(**gen_args)
+            )
+        return model
+
+    def valid_step(self, sample, model, criterion):
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        if self.cfg.eval_bleu:  # add raw BLEU statistics to the logging output; the score itself is not computed here
+            bleu = self._inference_with_bleu(self.sequence_generator, sample, model)
+            logging_output["_bleu_sys_len"] = bleu.sys_len
+            logging_output["_bleu_ref_len"] = bleu.ref_len
+            # we split counts into separate entries so that they can be
+            # summed efficiently across workers using fast-stat-sync
+            assert len(bleu.counts) == EVAL_BLEU_ORDER
+            for i in range(EVAL_BLEU_ORDER):
+                logging_output["_bleu_counts_" + str(i)] = bleu.counts[i]
+                logging_output["_bleu_totals_" + str(i)] = bleu.totals[i]
+        return loss, sample_size, logging_output
+
+    def reduce_metrics(self, logging_outputs, criterion):
+        """Sum the logged BLEU statistics and register a derived ``bleu`` metric."""
+        super().reduce_metrics(logging_outputs, criterion)
+        if self.cfg.eval_bleu:
+
+            def sum_logs(key):
+                import torch
+                result = sum(log.get(key, 0) for log in logging_outputs)
+                if torch.is_tensor(result):
+                    result = result.cpu()
+                return result
+
+            counts, totals = [], []
+            for i in range(EVAL_BLEU_ORDER):
+                counts.append(sum_logs("_bleu_counts_" + str(i)))
+                totals.append(sum_logs("_bleu_totals_" + str(i)))
+
+            if max(totals) > 0:
+                # log counts as numpy arrays -- log_scalar will sum them correctly
+                metrics.log_scalar("_bleu_counts", np.array(counts))
+                metrics.log_scalar("_bleu_totals", np.array(totals))
+                metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len"))
+                metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len"))
+
+                def compute_bleu(meters):
+                    import inspect
+
+                    try:
+                        from sacrebleu.metrics import BLEU
+
+                        comp_bleu = BLEU.compute_bleu
+                    except ImportError:
+                        # compatibility API for sacrebleu 1.x
+                        import sacrebleu
+
+                        comp_bleu = sacrebleu.compute_bleu
+                    # pass whichever smoothing keyword this compute_bleu accepts
+                    fn_sig = inspect.getfullargspec(comp_bleu)[0]
+                    if "smooth_method" in fn_sig:
+                        smooth = {"smooth_method": "exp"}
+                    else:
+                        smooth = {"smooth": "exp"}
+                    bleu = comp_bleu(
+                        correct=meters["_bleu_counts"].sum,
+                        total=meters["_bleu_totals"].sum,
+                        sys_len=int(meters["_bleu_sys_len"].sum),
+                        ref_len=int(meters["_bleu_ref_len"].sum),
+                        **smooth,
+                    )
+                    return round(bleu.score, 2)
+
+                metrics.log_derived("bleu", compute_bleu)
+
+    def max_positions(self):
+        """Return ``(max_source_positions, max_target_positions)`` from the task config."""
+        return (self.cfg.max_source_positions, self.cfg.max_target_positions)
+
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary` (``self.src_dict``)."""
+        return self.src_dict
+
+    @property
+    def target_dictionary(self):
+        """Return the target :class:`~fairseq.data.Dictionary` (``self.tgt_dict``)."""
+        return self.tgt_dict
+
+    def _inference_with_bleu(self, generator, sample, model):
+        """Decode ``sample`` with ``generator`` and score it against the targets.
+
+        Returns whatever ``sacrebleu.corpus_bleu`` returns; ``tokenize="none"``
+        is passed to it when ``eval_tokenized_bleu`` is set.
+        """
+        import sacrebleu
+
+        def decode(toks, escape_unk=False):
+            # The default unknown string in fairseq is `<unk>`, but sacrebleu
+            # tokenizes it as `< unk >`, inflating BLEU scores. Use a more
+            # verbose alternative that is unlikely to appear in the real
+            # reference, but doesn't get split into multiple tokens.
+            unk_string = "UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"
+            text = self.tgt_dict.string(
+                toks.int().cpu(), self.cfg.eval_bleu_remove_bpe, unk_string=unk_string
+            )
+            if self.tokenizer:
+                text = self.tokenizer.decode(text)
+            return text
+
+        gen_out = self.inference_step(generator, [model], sample, prefix_tokens=None)
+        hyps, refs = [], []
+        for idx, beams in enumerate(gen_out):
+            hyps.append(decode(beams[0]["tokens"]))
+            # escape <unk> in the reference: don't count it as a match to the hypo
+            target = utils.strip_pad(sample["target"][idx], self.tgt_dict.pad())
+            refs.append(decode(target, escape_unk=True))
+
+        if self.cfg.eval_bleu_print_samples:
+            logger.info("example hypothesis: " + hyps[0])
+            logger.info("example reference: " + refs[0])
+        # the extra keyword is only supplied for already-tokenized evaluation
+        bleu_kwargs = {"tokenize": "none"} if self.cfg.eval_tokenized_bleu else {}
+        return sacrebleu.corpus_bleu(hyps, [refs], **bleu_kwargs)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
new file mode 100644
index 0000000000..0fd7a5b29f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from fairseq import utils
+from fairseq.data import LanguagePairDataset
+
+from . import register_task
+from .translation import TranslationTask, load_langpair_dataset
+
+
+@register_task("translation_from_pretrained_bart")
+class TranslationFromPretrainedBARTTask(TranslationTask):
+    """
+    Translate from source language to target language with a model initialized with a multilingual pretrain.
+
+    Args:
+        src_dict (~fairseq.data.Dictionary): dictionary for the source language
+        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+
+    The translation task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.translation_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add ``TranslationTask``'s arguments plus ``--langs`` and ``--prepend-bos``."""
+        # fmt: off
+        TranslationTask.add_args(parser)
+        parser.add_argument('--langs',  type=str, metavar='LANG',
+                            help='comma-separated list of monolingual language, '
+                                 'for example, "en,de,fr". These should match the '
+                                 'langs from pretraining (and be in the same order). '
+                                 'You should always add all pretraining language idx '
+                                 'during finetuning.')
+        parser.add_argument('--prepend-bos', action='store_true',
+                            help='prepend bos token to each sentence, which matches '
+                                 'mBART pretraining')
+        # fmt: on
+
+    def __init__(self, args, src_dict, tgt_dict):
+        """Add one ``[lang]`` symbol per language, then ``<mask>``, to both dictionaries."""
+        super().__init__(args, src_dict, tgt_dict)
+        self.langs = args.langs.split(",")
+        for dictionary in (src_dict, tgt_dict):
+            for symbol in ["[{}]".format(lang) for lang in self.langs] + ["<mask>"]:
+                dictionary.add_symbol(symbol)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split into ``self.datasets[split]``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): selects the data path, ``paths[(epoch - 1) % len(paths)]``
+        """
+        paths = utils.split_paths(self.args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        # the language codes are taken straight from the task arguments
+        src, tgt = self.args.source_lang, self.args.target_lang
+
+        self.datasets[split] = load_langpair_dataset(
+            data_path,
+            split,
+            src,
+            self.src_dict,
+            tgt,
+            self.tgt_dict,
+            combine=combine,
+            dataset_impl=self.args.dataset_impl,
+            upsample_primary=self.args.upsample_primary,
+            left_pad_source=self.args.left_pad_source,
+            left_pad_target=self.args.left_pad_target,
+            max_source_positions=getattr(self.args, "max_source_positions", 1024),
+            max_target_positions=getattr(self.args, "max_target_positions", 1024),
+            load_alignments=self.args.load_alignments,
+            prepend_bos=getattr(self.args, "prepend_bos", False),
+            append_source_id=True,
+        )
+
+    def build_generator(self, models, args, **unused):
+        """Build a scorer or generator whose EOS is the target ``[lang]`` symbol."""
+        # the index of the target language symbol serves as end-of-sentence id
+        eos_idx = self.tgt_dict.index("[{}]".format(self.args.target_lang))
+        if getattr(args, "score_reference", False):
+            from fairseq.sequence_scorer import SequenceScorer
+
+            return SequenceScorer(self.target_dictionary, eos=eos_idx)
+
+        from fairseq.sequence_generator import SequenceGenerator
+
+        return SequenceGenerator(
+            models,
+            self.target_dictionary,
+            beam_size=getattr(args, "beam", 5),
+            max_len_a=getattr(args, "max_len_a", 0),
+            max_len_b=getattr(args, "max_len_b", 200),
+            min_len=getattr(args, "min_len", 1),
+            normalize_scores=(not getattr(args, "unnormalized", False)),
+            len_penalty=getattr(args, "lenpen", 1),
+            unk_penalty=getattr(args, "unkpen", 0),
+            temperature=getattr(args, "temperature", 1.0),
+            match_source_len=getattr(args, "match_source_len", False),
+            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
+            eos=eos_idx,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        """Append the source ``[lang]`` id to every sentence and wrap them in a dataset."""
+        src_lang_id = self.source_dictionary.index("[{}]".format(self.args.source_lang))
+        # NOTE(review): src_lengths is passed on unchanged although every sentence grows by one
+        source_tokens = [
+            torch.cat([sent, sent.new(1).fill_(src_lang_id)]) for sent in src_tokens
+        ]
+        return LanguagePairDataset(
+            source_tokens,
+            src_lengths,
+            self.source_dictionary,
+            tgt_dict=self.target_dictionary,
+            constraints=constraints,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
new file mode 100644
index 0000000000..a05f289152
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
+from fairseq.tasks.translation import TranslationConfig, TranslationTask
+
+from . import register_task
+
+
+@dataclass
+class TranslationFromPretrainedXLMConfig(TranslationConfig):
+    pass  # adds no options of its own; passed to register_task as this task's dataclass
+
+
+@register_task(
+    "translation_from_pretrained_xlm", dataclass=TranslationFromPretrainedXLMConfig
+)
+class TranslationFromPretrainedXLMTask(TranslationTask):
+    """
+    Same as TranslationTask except use the MaskedLMDictionary class so that
+    we can load data that was binarized with the MaskedLMDictionary class.
+
+    This task should be used for the entire training pipeline when we want to
+    train an NMT model from a pretrained XLM checkpoint: binarizing NMT data,
+    training NMT with the pretrained XLM checkpoint, and subsequent evaluation
+    of that trained model.
+    """
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load the dictionary stored in ``filename`` via ``MaskedLMDictionary.load``.
+
+        Args:
+            filename (str): the filename
+        """
+        return MaskedLMDictionary.load(filename)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
new file mode 100644
index 0000000000..b45fecd1f4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
@@ -0,0 +1,195 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+import torch
+from fairseq import utils
+from fairseq.data import LanguagePairDataset
+from fairseq.dataclass import ChoiceEnum
+from fairseq.tasks import register_task
+from fairseq.tasks.translation import (
+    TranslationConfig,
+    TranslationTask,
+    load_langpair_dataset,
+)
+from fairseq.utils import new_arange
+
+
+NOISE_CHOICES = ChoiceEnum(["random_delete", "random_mask", "no_noise", "full_mask"])
+
+
+@dataclass
+class TranslationLevenshteinConfig(TranslationConfig):
+    noise: NOISE_CHOICES = field(  # read by inject_noise() to pick the noise function
+        default="random_delete",
+        metadata={"help": "type of noise"},
+    )
+
+
+@register_task("translation_lev", dataclass=TranslationLevenshteinConfig)
+class TranslationLevenshteinTask(TranslationTask):
+    """
+    Translation (Sequence Generation) task for Levenshtein Transformer
+    See `"Levenshtein Transformer" <https://arxiv.org/abs/1905.11006>`_.
+    """
+
+    cfg: TranslationLevenshteinConfig
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split into ``self.datasets[split]``.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            epoch (int): selects the data path, ``paths[(epoch - 1) % len(paths)]``
+        """
+        paths = utils.split_paths(self.cfg.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+        # the language codes are taken straight from the task config
+        src, tgt = self.cfg.source_lang, self.cfg.target_lang
+
+        self.datasets[split] = load_langpair_dataset(
+            data_path,
+            split,
+            src,
+            self.src_dict,
+            tgt,
+            self.tgt_dict,
+            combine=combine,
+            dataset_impl=self.cfg.dataset_impl,
+            upsample_primary=self.cfg.upsample_primary,
+            left_pad_source=self.cfg.left_pad_source,
+            left_pad_target=self.cfg.left_pad_target,
+            max_source_positions=self.cfg.max_source_positions,
+            max_target_positions=self.cfg.max_target_positions,
+            prepend_bos=True,
+        )
+
+    def inject_noise(self, target_tokens):
+        """Apply the noise selected by ``self.cfg.noise`` to ``target_tokens``."""
+        def _random_delete(target_tokens):
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            # random score per token; <pad> gets 1 so that it sorts behind all real tokens
+            max_len = target_tokens.size(1)
+            target_mask = target_tokens.eq(pad)
+            target_score = target_tokens.clone().float().uniform_()
+            target_score.masked_fill_(
+                target_tokens.eq(bos) | target_tokens.eq(eos), 0.0
+            )
+            target_score.masked_fill_(target_mask, 1)
+            target_score, target_rank = target_score.sort(1)
+            target_length = target_mask.size(1) - target_mask.float().sum(
+                1, keepdim=True
+            )
+
+            # do not delete <bos> and <eos> (we assign 0 score for them)
+            target_cutoff = (
+                2
+                + (
+                    (target_length - 2)
+                    * target_score.new_zeros(target_score.size(0), 1).uniform_()
+                ).long()
+            )
+            target_cutoff = target_score.sort(1)[1] >= target_cutoff
+            # pad everything past the cutoff (in score order), then restore the original order
+            prev_target_tokens = (
+                target_tokens.gather(1, target_rank)
+                .masked_fill_(target_cutoff, pad)
+                .gather(1, target_rank.masked_fill_(target_cutoff, max_len).sort(1)[1])
+            )
+            prev_target_tokens = prev_target_tokens[
+                :, : prev_target_tokens.ne(pad).sum(1).max()
+            ]
+            return prev_target_tokens
+
+        def _random_mask(target_tokens):
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            unk = self.tgt_dict.unk()
+            # True for ordinary tokens, i.e. everything except <pad>, <bos> and <eos>
+            target_masks = (
+                target_tokens.ne(pad) & target_tokens.ne(bos) & target_tokens.ne(eos)
+            )
+            target_score = target_tokens.clone().float().uniform_()
+            target_score.masked_fill_(~target_masks, 2.0)
+            target_length = target_masks.sum(1).float()
+            target_length = target_length * target_length.clone().uniform_()
+            target_length = target_length + 1  # make sure to mask at least one token.
+            # the lowest-scoring positions become <unk>; special tokens score 2.0 and rank last
+            _, target_rank = target_score.sort(1)
+            target_cutoff = new_arange(target_rank) < target_length[:, None].long()
+            prev_target_tokens = target_tokens.masked_fill(
+                target_cutoff.scatter(1, target_rank, target_cutoff), unk
+            )
+            return prev_target_tokens
+
+        def _full_mask(target_tokens):
+            pad = self.tgt_dict.pad()
+            bos = self.tgt_dict.bos()
+            eos = self.tgt_dict.eos()
+            unk = self.tgt_dict.unk()
+            # <bos>, <eos> and <pad> are kept; every other token becomes <unk>
+            target_mask = (
+                target_tokens.eq(bos) | target_tokens.eq(eos) | target_tokens.eq(pad)
+            )
+            return target_tokens.masked_fill(~target_mask, unk)
+
+        if self.cfg.noise == "random_delete":
+            return _random_delete(target_tokens)
+        elif self.cfg.noise == "random_mask":
+            return _random_mask(target_tokens)
+        elif self.cfg.noise == "full_mask":
+            return _full_mask(target_tokens)
+        elif self.cfg.noise == "no_noise":
+            return target_tokens
+        else:
+            raise NotImplementedError
+
+    def build_generator(self, models, args, **unused):
+        """Build an ``IterativeRefinementGenerator`` from ``args``, with defaults."""
+        # add models input to match the API for SequenceGenerator
+        from fairseq.iterative_refinement_generator import IterativeRefinementGenerator
+        return IterativeRefinementGenerator(
+            self.target_dictionary,
+            eos_penalty=getattr(args, "iter_decode_eos_penalty", 0.0),
+            max_iter=getattr(args, "iter_decode_max_iter", 10),
+            beam_size=getattr(args, "iter_decode_with_beam", 1),
+            reranking=getattr(args, "iter_decode_with_external_reranker", False),
+            decoding_format=getattr(args, "decoding_format", None),
+            adaptive=not getattr(args, "iter_decode_force_max_iter", False),
+            retain_history=getattr(args, "retain_iter_history", False),
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        """Wrap source sentences in a dataset with ``append_bos=True``; rejects constraints."""
+        if constraints is not None:
+            # Though see Susanto et al. (ACL 2020): https://www.aclweb.org/anthology/2020.acl-main.325/
+            raise NotImplementedError(
+                "Constrained decoding with the translation_lev task is not supported"
+            )
+        return LanguagePairDataset(
+            src_tokens, src_lengths, self.source_dictionary, append_bos=True
+        )
+
+    def train_step(  # one training update on a freshly noised target
+        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
+    ):
+        model.train()
+        sample["prev_target"] = self.inject_noise(sample["target"])  # noised target
+        loss, sample_size, logging_output = criterion(model, sample)
+        if ignore_grad:
+            loss *= 0  # the backward pass below then runs on a zeroed loss
+        optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):  # loss on a noised target, no gradients
+        model.eval()
+        with torch.no_grad():
+            sample["prev_target"] = self.inject_noise(sample["target"])  # noised target
+            loss, sample_size, logging_output = criterion(model, sample)
+        return loss, sample_size, logging_output
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
new file mode 100644
index 0000000000..5db36a7c79
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
@@ -0,0 +1,441 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import datetime
+import logging
+import time
+
+import torch
+from fairseq.data import (
+    FairseqDataset,
+    LanguagePairDataset,
+    ListDataset,
+    data_utils,
+    iterators,
+)
+from fairseq.data.multilingual.multilingual_data_manager import (
+    MultilingualDatasetManager,
+)
+from fairseq.data.multilingual.sampling_method import SamplingMethod
+from fairseq.tasks import LegacyFairseqTask, register_task
+from fairseq.utils import FileContentsAction
+
+
+###
+def get_time_gap(s, e):
+    """Return the time elapsed between POSIX timestamps ``s`` and ``e`` as ``str(timedelta)``."""
+    # subtract the raw timestamps: naive local datetimes are off across a DST change
+    return str(datetime.timedelta(seconds=e - s))
+
+
+###
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_task("translation_multi_simple_epoch")
+class TranslationMultiSimpleEpochTask(LegacyFairseqTask):
+    """
+    Translate from one (source) language to another (target) language.
+
+    Args:
+        langs (List[str]): a list of languages that are being supported
+        dicts (Dict[str, fairseq.data.Dictionary]): mapping from supported languages to their dictionaries
+        training (bool): whether the task should be configured for training or not
+
+    .. note::
+
+        The translation task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.
+
+    The translation task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.translation_parser
+        :prog:
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add the language options plus the sampling and data-manager arguments."""
+        # fmt: off
+        parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
+                            help='inference source language')
+        parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
+                            help='inference target language')
+        parser.add_argument('--lang-pairs', default=None, metavar='PAIRS',
+                            help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr',
+                            action=FileContentsAction)
+        parser.add_argument('--keep-inference-langtok', action='store_true',
+                            help='keep language tokens in inference output (e.g. for analysis or debugging)')
+
+        SamplingMethod.add_arguments(parser)
+        MultilingualDatasetManager.add_args(parser)
+        # fmt: on
+
+    def __init__(self, args, langs, dicts, training):
+        """Derive the language pairs, check the dictionaries, build sampler and data manager."""
+        super().__init__(args)
+        self.langs = langs
+        self.dicts = dicts
+        self.training = training
+        if training:
+            self.lang_pairs = args.lang_pairs
+        else:
+            self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)]
+        # eval_lang_pairs for multilingual translation is usually all of the
+        # lang_pairs. However for other multitask settings or when we want to
+        # optimize for certain languages we want to use a different subset. Thus
+        # the eval_lang_pairs class variable is provided for classes that extend
+        # this class.
+        self.eval_lang_pairs = self.lang_pairs
+        # model_lang_pairs will be used to build encoder-decoder model pairs in
+        # models.build_model(). This allows multitask type of sub-class can
+        # build models other than the input lang_pairs
+        self.model_lang_pairs = self.lang_pairs
+        self.source_langs = [d.split("-")[0] for d in self.lang_pairs]
+        self.target_langs = [d.split("-")[1] for d in self.lang_pairs]
+        self.check_dicts(self.dicts, self.source_langs, self.target_langs)
+        self.sampling_method = SamplingMethod.build_sampler(args, self)
+        self.data_manager = MultilingualDatasetManager.setup_data_manager(
+            args, self.lang_pairs, langs, dicts, self.sampling_method
+        )
+
+    def check_dicts(self, dicts, source_langs, target_langs):
+        """Assert that all source (and all target) languages use equal dictionaries."""
+        if self.args.source_dict is not None or self.args.target_dict is not None:
+            # no need to check whether the source side and target side are sharing dictionaries
+            return
+        src_dict, tgt_dict = dicts[source_langs[0]], dicts[target_langs[0]]
+        for src_lang in source_langs:
+            assert src_dict == dicts[src_lang], (
+                "Diffrent dictionary are specified for different source languages; "
+                "TranslationMultiSimpleEpochTask only supports one shared dictionary across all source languages"
+            )
+        for tgt_lang in target_langs:
+            assert tgt_dict == dicts[tgt_lang], (
+                "Diffrent dictionary are specified for different target languages; "
+                "TranslationMultiSimpleEpochTask only supports one shared dictionary across all target languages"
+            )
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Create the task from what ``MultilingualDatasetManager.prepare`` returns."""
+        prep = MultilingualDatasetManager.prepare(cls.load_dictionary, args, **kwargs)
+        langs, dicts, training = prep
+        return cls(args, langs, dicts, training)
+
+    def has_sharded_data(self, split):  # answered by the data manager
+        return self.data_manager.has_sharded_data(split)
+
+    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+
+        With ``virtual_epoch_size`` set, an already loaded sharded split is only
+        reloaded when its ``load_next_shard`` flag is true.
+        """
+        if split not in self.datasets or not self.has_sharded_data(split):
+            # first load, or reload of an unsharded split: estimate the shard epoch
+            # from virtual data size and virtual epoch size
+            shard_epoch = self.data_manager.estimate_global_pass_epoch(epoch)
+        elif self.args.virtual_epoch_size is None:
+            shard_epoch = epoch
+        elif self.datasets[split].load_next_shard:
+            shard_epoch = self.datasets[split].shard_epoch
+        else:
+            # no need to load next shard so skip loading
+            # also this avoids always loading from the beginning of the data
+            return
+        logger.info(f"loading data for {split} epoch={epoch}/{shard_epoch}")
+        logger.info(f"mem usage: {data_utils.get_mem_usage()}")
+        if split in self.datasets:
+            del self.datasets[split]
+            logger.info("old dataset deleted manually")
+            logger.info(f"mem usage: {data_utils.get_mem_usage()}")
+        self.datasets[split] = self.data_manager.load_dataset(
+            split,
+            self.training,
+            epoch=epoch,
+            combine=combine,
+            shard_epoch=shard_epoch,
+            **kwargs,
+        )
+
+    def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None):
+        """Build the inference dataset and apply the ``main`` language-token specs to it."""
+        if constraints is not None:
+            raise NotImplementedError(
+                "Constrained decoding with the multilingual_translation task is not supported"
+            )
+        src_data = ListDataset(src_tokens, src_lengths)
+        dataset = LanguagePairDataset(src_data, src_lengths, self.source_dictionary)
+        src_langtok_spec, tgt_langtok_spec = self.args.langtoks["main"]
+        if self.args.lang_tok_replacing_bos_eos:
+            dataset = self.data_manager.alter_dataset_langtok(
+                dataset,
+                src_eos=self.source_dictionary.eos(),
+                src_lang=self.args.source_lang,
+                tgt_eos=self.target_dictionary.eos(),
+                tgt_lang=self.args.target_lang,
+                src_langtok_spec=src_langtok_spec,
+                tgt_langtok_spec=tgt_langtok_spec,
+            )
+        else:
+            dataset.src = self.data_manager.src_dataset_tranform_func(
+                self.args.source_lang,
+                self.args.target_lang,
+                dataset=dataset.src,
+                spec=src_langtok_spec,
+            )
+        return dataset
+
+    def build_generator(self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None):
+        """Build the generator; unless ``keep_inference_langtok`` is set, the
+        decoder language token (if any) is added to the symbols stripped from output.
+        """
+        if not getattr(args, "keep_inference_langtok", False):
+            _, tgt_langtok_spec = self.args.langtoks["main"]
+            if tgt_langtok_spec:
+                tgt_lang_tok = self.data_manager.get_decoder_langtok(
+                    self.args.target_lang, tgt_langtok_spec
+                )
+                # work on a copy so that the caller's dict is never mutated
+                extra_gen_cls_kwargs = dict(extra_gen_cls_kwargs or {})
+                extra_gen_cls_kwargs["symbols_to_strip_from_output"] = {tgt_lang_tok}
+        return super().build_generator(
+            models,
+            args,
+            seq_gen_cls=seq_gen_cls,
+            extra_gen_cls_kwargs=extra_gen_cls_kwargs,
+        )
+
+    def build_model(self, args, from_checkpoint=False):  # plain delegation to the parent
+        return super().build_model(args, from_checkpoint)
+
+    def valid_step(self, sample, model, criterion):  # no task-specific validation logic
+        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
+        return loss, sample_size, logging_output
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        """Generate without gradients, passing the target language token if configured."""
+        with torch.no_grad():
+            _, tgt_langtok_spec = self.args.langtoks["main"]
+            extra_kwargs = {}
+            if self.args.lang_tok_replacing_bos_eos:
+                extra_kwargs["bos_token"] = (
+                    self.data_manager.get_decoder_langtok(
+                        self.args.target_lang, tgt_langtok_spec
+                    )
+                    if tgt_langtok_spec
+                    else self.target_dictionary.eos()
+                )
+            elif prefix_tokens is None and tgt_langtok_spec:
+                tgt_lang_tok = self.data_manager.get_decoder_langtok(
+                    self.args.target_lang, tgt_langtok_spec
+                )
+                src_tokens = sample["net_input"]["src_tokens"]
+                bsz = src_tokens.size(0)
+                prefix_tokens = (
+                    torch.LongTensor([[tgt_lang_tok]]).expand(bsz, 1).to(src_tokens)
+                )
+            # constraints are forwarded to the generator in both modes
+            return generator.generate(
+                models,
+                sample,
+                prefix_tokens=prefix_tokens,
+                constraints=constraints,
+                **extra_kwargs,
+            )
+
+    def reduce_metrics(self, logging_outputs, criterion):  # plain delegation to the parent
+        super().reduce_metrics(logging_outputs, criterion)
+
+    def max_positions(self):
+        """Return ``(max_source_positions, max_target_positions)`` from the task args."""
+        return (self.args.max_source_positions, self.args.max_target_positions)
+
+    @property
+    def source_dictionary(self):  # the data manager's dictionary for the first source language
+        return self.data_manager.get_source_dictionary(self.source_langs[0])
+
+    @property
+    def target_dictionary(self):  # the data manager's dictionary for the first target language
+        return self.data_manager.get_target_dictionary(self.target_langs[0])
+
+    def create_batch_sampler_func(
+        self,
+        max_positions,
+        ignore_invalid_inputs,
+        max_tokens,
+        max_sentences,
+        required_batch_size_multiple=1,
+        seed=1,
+    ):
+        """Build the per-epoch batch-sampler factory.
+
+        The returned ``construct_batch_sampler(dataset, epoch)`` orders the
+        dataset's indices, optionally filters them by ``max_positions`` and
+        batches them, logging timing and memory usage after each stage.
+        """
+
+        def construct_batch_sampler(dataset, epoch):
+            # Label the log lines with the first registered split holding this dataset.
+            split = next((s for s, d in self.datasets.items() if d == dataset), None)
+
+            # initialize the dataset with the correct starting epoch
+            if epoch is not None:
+                dataset.set_epoch(epoch)
+
+            # get indices ordered by example size
+            start_time = time.time()
+            logger.info(f"start batch sampler: mem usage: {data_utils.get_mem_usage()}")
+            with data_utils.numpy_seed(seed):
+                indices = dataset.ordered_indices()
+            elapsed = get_time_gap(start_time, time.time())
+            logger.info(f"[{split}] @batch_sampler order indices time: {elapsed}")
+            logger.info(f"mem usage: {data_utils.get_mem_usage()}")
+
+            # filter examples that are too large
+            if max_positions is not None:
+                my_time = time.time()
+                indices = self.filter_indices_by_size(
+                    indices, dataset, max_positions, ignore_invalid_inputs
+                )
+                elapsed = get_time_gap(my_time, time.time())
+                logger.info(f"[{split}] @batch_sampler filter_by_size time: {elapsed}")
+                logger.info(f"mem usage: {data_utils.get_mem_usage()}")
+
+            # create mini-batches with given size constraints
+            my_time = time.time()
+            size_limits = dict(
+                max_tokens=max_tokens,
+                max_sentences=max_sentences,
+                required_batch_size_multiple=required_batch_size_multiple,
+            )
+            batch_sampler = dataset.batch_by_size(indices, **size_limits)
+
+            elapsed = get_time_gap(my_time, time.time())
+            logger.info(f"[{split}] @batch_sampler batch_by_size time: {elapsed}")
+            elapsed = get_time_gap(start_time, time.time())
+            logger.info(f"[{split}] per epoch batch_sampler set-up time: {elapsed}")
+            logger.info(f"mem usage: {data_utils.get_mem_usage()}")
+
+            return batch_sampler
+
+        return construct_batch_sampler
+
+    # we need to override get_batch_iterator because we want to reset the epoch iterator each time
+    def get_batch_iterator(
+        self,
+        dataset,
+        max_tokens=None,
+        max_sentences=None,
+        max_positions=None,
+        ignore_invalid_inputs=False,
+        required_batch_size_multiple=1,
+        seed=1,
+        num_shards=1,
+        shard_id=0,
+        num_workers=0,
+        epoch=1,
+        data_buffer_size=0,
+        disable_iterator_cache=False,
+        skip_remainder_batch=False,
+        grouped_shuffling=False,
+        update_epoch_batch_itr=False,
+    ):
+        """
+        Get an iterator that yields batches of data from the given dataset.
+
+        An iterator already stored in ``self.dataset_to_epoch_iter`` for this
+        dataset is returned as-is; within this method only the "RoundRobin"
+        path stores one.
+
+        Args:
+            dataset (~fairseq.data.FairseqDataset): dataset to batch
+            max_tokens (int, optional): max number of tokens in each batch
+                (default: None).
+            max_sentences (int, optional): max number of sentences in each
+                batch (default: None).
+            max_positions (optional): max sentence length supported by the
+                model (default: None).
+            ignore_invalid_inputs (bool, optional): don't raise Exception for
+                sentences that are too long (default: False).
+            required_batch_size_multiple (int, optional): require batch size to
+                be a multiple of N (default: 1).
+            seed (int, optional): seed for random number generator for
+                reproducibility (default: 1).
+            num_shards (int, optional): shard the data iterator into N
+                shards (default: 1).
+            shard_id (int, optional): which shard of the data iterator to
+                return (default: 0).
+            num_workers (int, optional): how many subprocesses to use for data
+                loading. 0 means the data will be loaded in the main process
+                (default: 0).
+            epoch (int, optional): the epoch to start the iterator from
+                (default: 1).
+            data_buffer_size, disable_iterator_cache, skip_remainder_batch,
+                update_epoch_batch_itr: forwarded to the parent implementation
+                when ``sampling_method == "RoundRobin"``; otherwise unused.
+            grouped_shuffling (bool, optional): accepted but not used by this
+                override, on either path (NOTE(review): confirm whether the
+                parent call should receive it).
+
+        Returns:
+            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
+                given dataset split
+        """
+        # an iterator already stored for this dataset is returned unchanged
+        assert isinstance(dataset, FairseqDataset)
+        if dataset in self.dataset_to_epoch_iter:
+            return self.dataset_to_epoch_iter[dataset]
+        if self.args.sampling_method == "RoundRobin":
+            batch_iter = super().get_batch_iterator(
+                dataset,
+                max_tokens=max_tokens,
+                max_sentences=max_sentences,
+                max_positions=max_positions,
+                ignore_invalid_inputs=ignore_invalid_inputs,
+                required_batch_size_multiple=required_batch_size_multiple,
+                seed=seed,
+                num_shards=num_shards,
+                shard_id=shard_id,
+                num_workers=num_workers,
+                epoch=epoch,
+                data_buffer_size=data_buffer_size,
+                disable_iterator_cache=disable_iterator_cache,
+                skip_remainder_batch=skip_remainder_batch,
+                update_epoch_batch_itr=update_epoch_batch_itr,
+            )
+            self.dataset_to_epoch_iter[dataset] = batch_iter  # remembered for later calls with this dataset
+            return batch_iter
+
+        construct_batch_sampler = self.create_batch_sampler_func(
+            max_positions,
+            ignore_invalid_inputs,
+            max_tokens,
+            max_sentences,
+            required_batch_size_multiple=required_batch_size_multiple,
+            seed=seed,
+        )
+
+        epoch_iter = iterators.EpochBatchIterator(
+            dataset=dataset,
+            collate_fn=dataset.collater,
+            batch_sampler=construct_batch_sampler,  # a callable (dataset, epoch) -> batches, not a prebuilt list
+            seed=seed,
+            num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            epoch=epoch,
+        )
+        return epoch_iter
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
new file mode 100644
index 0000000000..e708dc51bc
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
@@ -0,0 +1,506 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Implements tracking of constraints for a beam item.
+
+A list of constraints is given as a list of one or more token
+sequences, each of length at least one token. For example, for an input sentence
+
+> Die maschinelle Übersetzung ist schwer zu kontrollieren.
+
+We could have the constraints:
+* to influence
+* hard
+
+There are two implementations:
+* OrderedConstraintState: Tracks progress through an ordered list of multitoken constraints.
+* UnorderedConstraintState: Tracks progress through an unordered list of multitoken constraints.
+
+The difference is that in the first, the constraints are assumed to be
+in order; the algorithm will permit zero or more tokens between them.
+In the second, the constraints are not ordered, so many orderings will
+be explored.
+
+The same sequence can be present any number of times, and will appear
+that many times in the output.
+"""
+
+from collections import Counter
+from typing import List, Optional, Set, Tuple
+
+import torch
+
+
+class ConstraintState:  # common base class of UnorderedConstraintState and OrderedConstraintState
+    def __init__(self):
+        pass
+
+
+def pack_constraints(batch_constraints: List[List[torch.Tensor]]) -> torch.Tensor:
+    """Pack per-sentence constraint lists into one 2-D LongTensor.
+
+    ``batch_constraints`` holds one list of 1-D constraint tensors per
+    sentence. For example, here is a batch of size 3 with 3, 0, and 1
+    constraints:
+
+        [ [ [3 1 2], [3], [4 5 6 7], ]
+          [],
+          [ [1 8 9 10 1 4 11 12], ]
+        ]
+
+    Its packed form is:
+
+        [ [ 3  3  1  2  0  3  0  4  5  6  7  0],
+          [ 0  0  0  0  0  0  0  0  0  0  0  0],
+          [ 1  1  8  9 10  1  4 11 12  0  0  0] ]
+
+    Each row starts with the number of constraints for that sentence,
+    followed by the constraints back to back with a 0 appended after
+    each one; the rest of the row stays 0. The result has shape
+    (batch size, maxlen), where maxlen is the maximum of
+
+    (number of constraints) + (sum length of constraints) + 1
+
+    across all sentences in the batch. A batch without sentences
+    yields a tensor of shape (0, 1).
+    """
+    # Row width: the longest packed sentence, and never less than the one
+    # slot that stores the constraint count.
+    row_lens = [
+        1 + len(constraints) + sum(c.size(0) for c in constraints)
+        for constraints in batch_constraints
+    ]
+    max_constraints_len = max(row_lens, default=1)
+
+    # Start from zeros so separators and padding need no explicit writes.
+    packed = torch.zeros(
+        (len(batch_constraints), max_constraints_len), dtype=torch.long
+    )
+
+    for row, constraints in enumerate(batch_constraints):
+        packed[row, 0] = len(constraints)
+        start = 1
+        for constraint in constraints:
+            end = start + constraint.size(0)
+            packed[row, start:end] = constraint
+            # skip one slot so a 0 separates this constraint from the next
+            start = end + 1
+
+    return packed
+
+
+def unpack_constraints(constraint_tensor: torch.Tensor) -> List[torch.Tensor]:
+    """
+    Split *one row* of a packed constraint tensor (i.e. the entry for a
+    single sentence of the batch) back into a list of constraint tensors.
+    """
+    row = constraint_tensor.tolist()
+    unpacked = []
+    start = 1  # slot 0 holds the number of constraints
+    for _ in range(row[0]):
+        # every constraint is terminated by a 0
+        end = row.index(0, start)
+        unpacked.append(constraint_tensor[start:end])
+        start = end + 1
+
+    return unpacked
+
+
+class ConstraintNode:
+    """
+    One node of the trie that stores a sentence's unordered constraints.
+    """
+
+    def __init__(self, token: int = None, parent=None):
+        # Token labelling this node (None for the root)
+        self.token = None if token is None else int(token)
+        # Parent node (None at the root)
+        self.parent = parent
+        # Number of constraints that end exactly at this node
+        self.terminal = 0
+        # Child nodes, keyed by their token
+        self.children = {}
+
+        # Number of constraints that end at this node or anywhere
+        # below it in the trie
+        self.num_constraints = 0
+
+    @property
+    def id(self):
+        return self.token  # the node's identifier is simply its token
+
+    def __str__(self):
+        is_terminal = self.terminal != 0
+        return f"[{self.token}].{is_terminal}#{self.num_constraints}"
+
+    def __getitem__(self, key: int):
+        return self.children.get(key)
+
+    def next_tokens(self) -> Set[int]:
+        """Tokens labelling the children of this node."""
+        return set(self.children)
+
+    @staticmethod
+    def create(constraints: List[List[int]]):
+        """Build a trie holding every sequence in ``constraints`` and return its root."""
+        trie_root = ConstraintNode()
+        for tokens in constraints:
+            trie_root.add_sequence(tokens)
+        return trie_root
+
+    @staticmethod
+    def print_graph(node: "ConstraintNode"):
+        """Render the subtree rooted at ``node`` as a string: a leaf is
+        ``str(node)``; an inner node is ``(node child child ...)``.
+        """
+        if not node.children:
+            return str(node)
+
+        rendered = [ConstraintNode.print_graph(kid) for kid in node.children.values()]
+        return f"({node} " + " ".join(rendered) + ")"
+
+    def token_counts(self) -> Counter:
+        """Sum, per token, the ``num_constraints`` of every node below this
+        one that carries that token (this node itself is not counted).
+        """
+        counts = Counter()
+        pending = list(self.children.values())
+        while pending:
+            node = pending.pop()
+            counts[node.id] += node.num_constraints
+            pending.extend(node.children.values())
+
+        return counts
+
+    def tokens(self) -> Set[int]:
+        """Returns the set of distinct tokens found below this node."""
+        return set(self.token_counts())
+
+    def add_sequence(self, sequence: List[int]):
+        """Insert one constraint (a non-empty sequence of token ids)
+        into the trie below this node."""
+        assert len(sequence) > 0
+
+        # Walk down the trie, creating missing nodes along the way.
+        node = self
+        for raw_token in sequence:
+            token = int(raw_token)
+            if token not in node.children:
+                node.children[token] = ConstraintNode(token, parent=node)
+            node = node.children[token]
+
+        # The last node ends a constraint; bump the cumulative count on
+        # it and on every ancestor up to the node that has no parent.
+        node.terminal += 1
+        while node is not None:
+            node.num_constraints += 1
+            node = node.parent
+
+
+class UnorderedConstraintState(ConstraintState):
+    """
+    Records progress through the set of constraints for each item in the beam
+    using a trie.
+    """
+
+    def __init__(self, node: ConstraintNode, copy_from: "ConstraintState" = None):
+        self.node = node
+
+        if copy_from is None:
+            # The root node
+            self.root = node
+            # Per-node count of constraints marked as completed at that node
+            self.completed = Counter()
+            # Per-node count of how many times that node has been generated
+            self.generated = Counter()
+            # The set of tokens that appear in any constraint
+            self.needed_tokens = self.root.tokens()
+        else:
+            self.completed = Counter(copy_from.completed)
+            self.generated = Counter(copy_from.generated)
+            self.root = copy_from.root
+            self.needed_tokens = copy_from.needed_tokens  # shared; never mutated in this class
+
+        # Mark the node as generated
+        if self.node != self.root:
+            self.generated[node] += 1
+
+    @staticmethod
+    def create(constraint_tensor: torch.Tensor):
+        constraint_list = unpack_constraints(constraint_tensor)
+        constraint_trie_root = ConstraintNode.create(constraint_list)
+        return UnorderedConstraintState(constraint_trie_root)
+
+    def __str__(self):
+        gen_str = ",".join([str(node) for node in self.generated])
+        return f"{self.name}/{self.bank}({gen_str})x{self.num_completed}"
+
+    def __copy__(self):
+        copied_state = UnorderedConstraintState(self.node, copy_from=self)
+        copied_state.generated = Counter(self.generated)  # the constructor re-counts `node`; a copy must not
+        return copied_state
+
+    def copy(self):
+        return self.__copy__()
+
+    @property
+    def name(self):
+        if self.node.id is None:
+            return "ROOT"
+        else:
+            return str(self.node.id)
+
+    @property
+    def is_root(self):
+        return self.node == self.root
+
+    @property
+    def bank(self):
+        return sum(self.generated.values())
+
+    @property
+    def num_completed(self):
+        """The number of constraints (not constraint tokens) that are completed.
+        In addition to the already-completed states, we need to account for the
+        current state, which might get marked as completed when another token
+        is generated.
+        """
+        in_final = self.node.terminal and self.completed[self.node] < self.node.terminal
+        return sum(self.completed.values()) + in_final
+
+    @property
+    def finished(self):
+        return self.root.num_constraints - self.num_completed == 0
+
+    @property
+    def token_counts(self):
+        return self.root.token_counts()
+
+    @property
+    def tokens(self):
+        return self.root.tokens()
+
+    @property
+    def num_constraint_tokens(self):
+        return sum(self.token_counts.values())
+
+    def next_tokens(self) -> Set[int]:
+        """Returns the list of tokens that could come next.
+        These are (a) all tokens extending the root state and, for
+        non-root states, additionally all tokens extending the current
+        state."""
+
+        if self.node != self.root:
+            return self.root.next_tokens().union(self.node.next_tokens())
+        else:
+            return self.root.next_tokens()
+
+    def advance(self, token: int):
+        """Reads in a token and advances the state. Here's how it works.
+
+        We can advance to the next state if:
+        - there is a matching child
+        - its path isn't blocked
+
+        A path is blocked when all constraints that are descendants of
+        that node have already been generated, in the current state.
+
+        If we are not able to advance from the current state, we "fall
+        off the graph" and return to the root state. There, we again
+        try to advance, checking the same criteria.
+
+        In any case, when falling off the graph, we need to do some
+        bookkeeping. We:
+        - check whether any constraints were met (all prefixes of
+          current state)
+        - if one is found, mark it as completed
+        - adjust visited nodes accordingly
+        """
+        token = int(token)
+
+        next_state = None
+        child = self.node[token]
+        if child is not None and self.generated[child] < child.num_constraints:
+            next_state = UnorderedConstraintState(child, copy_from=self)
+
+        def rewind():
+            """If we're mid-trie and an "illegal" token is chosen next, we need
+            to reset our state to the root state. However, along the way, we need
+            to check whether a prefix of the current trie state represents a state
+            we could mark as completed.
+            """
+            node = self.node
+            while node != self.root:
+                if node.terminal and self.completed[node] < node.terminal:
+                    next_state.completed[node] += 1
+                    return
+
+                next_state.generated[node] -= 1
+                node = node.parent
+
+        # Fall off the graph, check the root
+        if next_state is None and token in self.root.next_tokens():
+            child = self.root[token]
+            # We can only traverse this edge if it's not saturated
+            if self.generated[child] < child.num_constraints:
+                next_state = UnorderedConstraintState(child, copy_from=self)
+            else:
+                next_state = UnorderedConstraintState(self.root, copy_from=self)
+
+            rewind()
+
+        elif next_state is None:
+            next_state = UnorderedConstraintState(self.root, copy_from=self)
+            rewind()
+
+        return next_state
+
+
+class ConstraintSequence:
+    """Ordered constraints flattened into one token list plus per-token end-of-constraint flags."""
+
+    def __init__(self, sequences: List[List[int]]):
+        self.sequences = []  # all constraint tokens, concatenated, as plain ints
+        self.endpoints = []  # endpoints[i] is True when token i ends a constraint
+        for sequence in sequences:
+            tokens = [int(token) for token in sequence]  # unwrap 0-dim tensors so tokens hash by value
+            self.endpoints += [False] * (len(tokens) - 1) + [True]
+            self.sequences += tokens
+        self.num_tokens = len(self.sequences)
+        self.tokens = set(self.sequences)
+
+    def token_counts(self) -> Counter:
+        return Counter(self.sequences)  # occurrences of each token across all constraints
+
+    def __getitem__(self, key: int):
+        return self.sequences[key]
+
+    def __len__(self):
+        return len(self.sequences)
+
+    def __str__(self):
+        return str(self.sequences)
+
+
+class OrderedConstraintState(ConstraintState):
+    """
+    Position within a fixed, ordered run of constraints (gaps are allowed between them).
+    """
+
+    def __init__(self, sequence: ConstraintSequence, state: int = -1):
+        self.state = state  # position of the most recently matched token; -1 means none yet
+        self.sequence = sequence
+
+    @staticmethod
+    def create(constraint_tensor: torch.Tensor):
+        unpacked = unpack_constraints(constraint_tensor)
+        return OrderedConstraintState(ConstraintSequence(unpacked), -1)
+
+    def __str__(self):
+        return "{}/{}x{}".format(self.state, self.bank, self.num_completed)
+
+    def __copy__(self):
+        return OrderedConstraintState(self.sequence, state=self.state)
+
+    def copy(self):
+        return self.__copy__()  # explicit-method spelling of the copy protocol
+
+    @property
+    def num_completed(self):
+        """Number of constraints whose last token has been matched so far."""
+        if self.is_root:
+            return 0
+        # endpoints flags the last token of every constraint; count the
+        # flags at or before the current position
+        return sum(1 for is_end in self.sequence.endpoints[: self.state + 1] if is_end)
+
+    @property
+    def is_root(self):
+        return self.state == -1
+
+    @property
+    def name(self):
+        """Label of this state: "ROOT" before any match, else the last matched token."""
+        if self.state == -1:
+            return "ROOT"
+        return str(self.sequence[self.state])
+
+    @property
+    def bank(self) -> int:
+        return 1 + self.state
+
+    @property
+    def finished(self):
+        return len(self.sequence) == self.state + 1
+
+    @property
+    def token_counts(self):
+        return self.sequence.token_counts()
+
+    @property
+    def tokens(self):
+        return self.sequence.tokens
+
+    @property
+    def num_constraint_tokens(self):
+        return sum(self.token_counts.values())
+
+    def next_tokens(self) -> Set[int]:
+        """Returns the tokens of interest from this state: the token that
+        follows the current position (unless finished) and, once past
+        position 0, the first token of the sequence.
+        """
+
+        candidates = set()
+        if self.state > 0:
+            candidates.add(self.sequence[0])
+        if not self.finished:
+            candidates.add(self.sequence[self.state + 1])
+        return candidates
+
+    def advance(self, token: int):
+        """Return the state reached after generating ``token``.
+
+        The current state is left untouched. The rules are tried in order:
+
+        1. every constraint is already matched: stay where we are;
+        2. ``token`` is the next token of the sequence: move one
+           position forward;
+        3. the current position ends a constraint: stay where we are,
+           since anything may appear between constraints;
+        4. ``token`` is the first token of the sequence: restart at
+           position 0;
+        5. otherwise: restart from the root (position -1).
+
+        Args:
+            token (int): the token that was just generated.
+
+        Returns:
+            OrderedConstraintState: the successor state.
+        """
+        token = int(token)
+        sequence, position = self.sequence, self.state
+
+        if self.finished:
+            # Accept anything
+            return self.copy()
+
+        if sequence[position + 1] == token:
+            # Advance to the next token
+            return OrderedConstraintState(sequence, position + 1)
+
+        # NOTE(review): at the root (position -1) this reads the last
+        # endpoint flag through negative indexing, so the root also
+        # takes this branch and simply stays at the root.
+        if sequence.endpoints[position]:
+            # Accept anything between constraints (*)
+            return self.copy()
+
+        if token == sequence[0]:
+            # Start over having generated the first token
+            return OrderedConstraintState(sequence, 0)
+
+        # Start over from the root
+        return OrderedConstraintState(sequence, -1)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
new file mode 100644
index 0000000000..42131f7b1d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import re
+
+
+SPACE_NORMALIZER = re.compile(r"\s+")
+
+
+def tokenize_line(line):
+    # squeeze every whitespace run to one space, trim the ends, then split
+    squeezed = SPACE_NORMALIZER.sub(" ", line).strip()
+    return squeezed.split()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
new file mode 100644
index 0000000000..16b1b91697
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
@@ -0,0 +1,1622 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Train a network across multiple GPUs.
+"""
+
+import contextlib
+import logging
+import os
+import sys
+import time
+from argparse import Namespace
+from itertools import chain
+from typing import Any, Dict, List
+
+import torch
+from omegaconf import OmegaConf
+
+from fairseq import checkpoint_utils, models, optim, utils
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.distributed import utils as distributed_utils
+from fairseq.file_io import PathManager
+from fairseq.logging import meters, metrics
+from fairseq.models.ema import build_ema
+from fairseq.nan_detector import NanDetector
+from fairseq.optim import lr_scheduler
+from fairseq.utils import safe_hasattr
+
+logger = logging.getLogger(__name__)
+
+
+class Trainer(object):
+    """Main class for data parallel training.
+
+    This class supports synchronous distributed data parallel training,
+    where multiple workers each have a full model replica and gradients
+    are accumulated across workers before each update. We use
+    :class:`~torch.nn.parallel.DistributedDataParallel` to handle
+    communication of the gradients across workers.
+    """
+
+    def __init__(self, cfg: FairseqConfig, task, model, criterion, quantizer=None):
+        """Pick the device, validate FSDP options, cast/move model and criterion, and reset bookkeeping."""
+        if isinstance(cfg, Namespace):
+            logger.warning(
+                "argparse.Namespace configuration is deprecated! Automatically converting to OmegaConf"
+            )
+            cfg = convert_namespace_to_omegaconf(cfg)
+
+        self.cfg = cfg
+        self.task = task
+
+        # catalog shared parameters
+        shared_params = _catalog_shared_params(model)
+        self.tpu = cfg.common.tpu
+        self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu
+        if self.cuda:
+            self.device = torch.device("cuda")
+        elif self.tpu:
+            self.device = utils.get_tpu_device()
+        else:
+            self.device = torch.device("cpu")
+
+        if self.is_fsdp:
+            import re
+            import fairscale
+
+            if self.cfg.common.bf16:
+                raise ValueError(
+                    "FullyShardedDataParallel is not compatible with --bf16 or "
+                    "--memory-efficient-bf16"
+                )
+            if self.cfg.distributed_training.zero_sharding != "none":
+                raise ValueError(
+                    "FullyShardedDataParallel is not compatible with --zero-sharding "
+                    "option (it's already built in)"
+                )
+            # Compare (major, minor) numerically: as strings, "0.10.0" sorts before "0.4.0".
+            fairscale_version = tuple(map(int, re.findall(r"\d+", fairscale.__version__)[:2]))
+            if max(self.cfg.optimization.update_freq) > 1 and fairscale_version < (0, 4):
+                raise RuntimeError(
+                    "Please update to fairscale 0.4.0 or newer when combining "
+                    "--update-freq with FullyShardedDataParallel"
+                )
+        else:
+            if (
+                hasattr(self.cfg.distributed_training, "cpu_offload")
+                and self.cfg.distributed_training.cpu_offload
+            ):
+                raise ValueError("--cpu-offload requires --ddp-backend=fully_sharded")
+
+        # copy model and criterion to current device/dtype
+        self._criterion = criterion
+        self._model = model
+        if not self.is_fsdp:
+            if cfg.common.fp16:
+                assert not cfg.common.amp, "Cannot use fp16 and AMP together"
+                self._criterion = self._criterion.half()
+                self._model = self._model.half()
+            elif cfg.common.bf16:
+                self._criterion = self._criterion.to(dtype=torch.bfloat16)
+                self._model = self._model.to(dtype=torch.bfloat16)
+            elif cfg.common.amp:
+                self._amp_retries = 0
+        if (
+            not cfg.distributed_training.pipeline_model_parallel
+            # the DistributedFairseqModel wrapper will handle moving to device,
+            # so only handle cases which don't use the wrapper
+            and not self.use_distributed_wrapper
+        ):
+            self._criterion = self._criterion.to(device=self.device)
+            self._model = self._model.to(device=self.device)
+        self.pipeline_model_parallel = cfg.distributed_training.pipeline_model_parallel
+        self.last_device = None
+        if self.cuda and self.pipeline_model_parallel:
+            self.last_device = torch.device(
+                cfg.distributed_training.pipeline_devices[-1]
+            )
+
+        # check that shared parameters are preserved after device transfer
+        for shared_param in shared_params:
+            ref = _get_module_by_path(self._model, shared_param[0])
+            for path in shared_param[1:]:
+                logger.info(
+                    "detected shared parameter: {} <- {}".format(shared_param[0], path)
+                )
+                _set_module_by_path(self._model, path, ref)
+
+        self._dummy_batch = None  # indicates we don't have a dummy batch at first
+        self._lr_scheduler = None
+        self._num_updates = 0
+        self._num_xla_compiles = 0  # for TPUs
+        self._optim_history = None
+        self._optimizer = None
+        self._warn_once = set()
+        self._wrapped_criterion = None
+        self._wrapped_model = None
+        self._ema = None
+
+        # TODO(myleott): support tpu
+        if self.cuda and self.data_parallel_world_size > 1:
+            self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
+        else:
+            self._grad_norm_buf = None
+
+        self.quantizer = quantizer
+        if self.quantizer is not None:
+            self.quantizer.set_trainer(self)
+
+        # get detailed cuda environment
+        if self.cuda:
+            self.cuda_env = utils.CudaEnvironment()
+            if self.data_parallel_world_size > 1:
+                self.cuda_env_arr = distributed_utils.all_gather_list(
+                    self.cuda_env, group=distributed_utils.get_global_group()
+                )
+            else:
+                self.cuda_env_arr = [self.cuda_env]
+            if self.data_parallel_rank == 0:
+                utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr)
+        else:
+            self.cuda_env = None
+            self.cuda_env_arr = None
+
+        metrics.log_start_time("wall", priority=790, round=0)
+
+        self._start_time = time.time()
+        self._previous_training_time = 0
+        self._cumulative_training_time = None
+
+    def reinitialize(self):
+        """Discard cached training objects, e.g. after the model params change."""
+        # The optimizer, LR scheduler and (possibly wrapped) model/criterion are
+        # rebuilt lazily by their properties the next time they are accessed.
+        self._optimizer = self._lr_scheduler = None
+        self._wrapped_model = self._wrapped_criterion = None
+
+    @property
+    def data_parallel_world_size(self):
+        """Number of data-parallel workers; 1 when distributed_world_size is 1."""
+        single = self.cfg.distributed_training.distributed_world_size == 1
+        return 1 if single else distributed_utils.get_data_parallel_world_size()
+
+    @property
+    def data_parallel_process_group(self):  # not cached; queried on every access
+        return distributed_utils.get_data_parallel_group()
+
+    @property
+    def data_parallel_rank(self):
+        """Rank among data-parallel workers; 0 when distributed_world_size is 1."""
+        single = self.cfg.distributed_training.distributed_world_size == 1
+        return 0 if single else distributed_utils.get_data_parallel_rank()
+
+    @property
+    def is_data_parallel_master(self):
+        """True on data parallel rank 0 (for every model parallel replica of it)."""
+        rank = self.data_parallel_rank
+        return rank == 0
+
+    @property
+    def use_distributed_wrapper(self) -> bool:
+        if self.data_parallel_world_size > 1 and not self.cfg.optimization.use_bmuf:
+            return True  # multi-worker run that is not using BMUF
+        return self.is_fsdp and self.cfg.distributed_training.cpu_offload
+
+    @property
+    def should_save_checkpoint_on_current_rank(self) -> bool:
+        """Whether the current data-parallel rank writes a checkpoint file."""
+        # Sharded FSDP state and models with base_layers > 0 save on every rank.
+        if self.is_fsdp and self.cfg.distributed_training.use_sharded_state:
+            return True
+        if getattr(self.cfg.model, "base_layers", 0) > 0:
+            return True
+        return self.is_data_parallel_master
+
+    @property
+    def always_call_state_dict_during_save_checkpoint(self) -> bool:
+        """Whether every rank must call ``state_dict()`` while checkpointing."""
+        # FSDP calls communication collectives when consolidating checkpoints,
+        # which only happens when its state is not saved in sharded form.
+        dist_cfg = self.cfg.distributed_training
+        return bool(self.is_fsdp and not dist_cfg.use_sharded_state)
+
+    @property
+    def checkpoint_suffix(self) -> str:
+        """Suffix to add to the checkpoint file name (never ``None``)."""
+        # Normalise up front: an unset suffix used to raise TypeError when the
+        # "-shard<rank>" part was concatenated onto it in the sharded branch.
+        suffix = self.cfg.checkpoint.checkpoint_suffix or ""
+        if self.is_fsdp and self.cfg.distributed_training.use_sharded_state:
+            suffix += "-shard{0}".format(self.data_parallel_rank)
+        return suffix
+
+    @property
+    def criterion(self):
+        """Criterion for training; wrapped once (and cached) when it needs DDP."""
+        if self._wrapped_criterion is not None:
+            return self._wrapped_criterion
+        wrapped = self._criterion
+        if utils.has_parameters(self._criterion) and self.use_distributed_wrapper:
+            wrapped = models.DistributedFairseqModel(
+                self.cfg.distributed_training, self._criterion,
+                process_group=self.data_parallel_process_group, device=self.device,
+            )
+        self._wrapped_criterion = wrapped
+        return wrapped
+
+    @property
+    def model(self):
+        """Model for training; wrapped once (and cached) when a wrapper is needed."""
+        if self._wrapped_model is not None:
+            return self._wrapped_model
+        wrapped = self._model
+        if self.use_distributed_wrapper:
+            wrapped = models.DistributedFairseqModel(
+                self.cfg.distributed_training, self._model,
+                process_group=self.data_parallel_process_group, device=self.device,
+            )
+        self._wrapped_model = wrapped
+        return wrapped
+
+    @property
+    def ema(self):
+        if self._ema is None:  # not built yet, or _build_ema() had nothing to build
+            self._build_ema()
+        return self._ema  # may still be None when cfg.ema.store_ema is off
+
+    def _build_ema(self):
+        if self.cfg.ema.store_ema:  # otherwise self._ema is left untouched
+            self._ema = build_ema(self._model, self.cfg.ema, self.device)
+            logger.info("Exponential Moving Average Shadow Model is initialized.")
+
+    @property
+    def optimizer(self):
+        if self._optimizer is None:  # first access, or after reinitialize()
+            self._build_optimizer()
+        return self._optimizer
+
+    @property
+    def lr_scheduler(self):
+        if self._lr_scheduler is None:  # first access, or after reinitialize()
+            self._build_optimizer()  # this will initialize self._lr_scheduler
+        return self._lr_scheduler
+
+    def _build_optimizer(self):
+        """Build self._optimizer from the config, then the matching LR scheduler."""
+        if (
+            self.cfg.optimization.debug_param_names
+            and self.cfg.common.fp16_no_flatten_grads
+        ):
+            params = []
+            self.param_names = []  # filled in lockstep with params below
+
+            for n, p in chain(
+                self.model.named_parameters(), self.criterion.named_parameters()
+            ):
+                if p.requires_grad:
+                    params.append(p)
+                    self.param_names.append(n)
+        else:
+            params = list(
+                filter(
+                    lambda p: p.requires_grad,
+                    chain(self.model.parameters(), self.criterion.parameters()),
+                )
+            )
+
+        if self.is_fsdp and self.cfg.common.fp16:
+            # FullyShardedDataParallel always uses MemoryEfficientFP16 wrapper,
+            # mostly for the grad scaling. But if we don't have the
+            # --memory-efficient-fp16 flag set, then we're effectively doing
+            # regular --fp16 and can allow the use of optimizers that would
+            # otherwise be unsupported by MemoryEfficientFP16Optimizer.
+            allow_unsupported = not self.cfg.common.memory_efficient_fp16
+            self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
+                self.cfg, params, allow_unsupported=allow_unsupported
+            )
+        elif self.cfg.common.fp16 or self.cfg.common.bf16 or self.cfg.common.amp:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
+                logger.info(
+                    "NOTE: your device does NOT support faster training with --fp16 or --amp, "
+                    "please switch to FP32 which is likely to be faster"
+                )
+            if (
+                self.cfg.common.memory_efficient_fp16
+                or self.cfg.common.memory_efficient_bf16
+            ):
+                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
+                    self.cfg, params
+                )
+            elif self.cfg.common.amp:
+                self._optimizer = optim.AMPOptimizer.build_optimizer(self.cfg, params)
+            else:
+                self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
+        else:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
+                logger.info(
+                    "NOTE: your device may support faster training with --fp16 or --amp"
+                )
+            self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)
+
+        if self.is_fsdp:
+            assert (
+                not self.cfg.optimization.use_bmuf
+            ), "--ddp-backend=fully_sharded is not compatible with BMUF"
+            assert self._optimizer.supports_flat_params, (
+                "--ddp-backend=fully_sharded is only compatible with pointwise "
+                "optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.). "
+                "However, the sharding will result in slightly different results when "
+                "using non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB)"
+            )
+
+        if self.cfg.optimization.use_bmuf:
+            self._optimizer = optim.FairseqBMUF(
+                self.cfg.bmuf,
+                self._optimizer,
+            )
+
+        if self.cfg.distributed_training.zero_sharding == "os":
+            if (
+                self.cfg.common.fp16
+                and not self.cfg.common.memory_efficient_fp16
+                and not self.cfg.common.memory_efficient_bf16
+            ) and not self.cfg.common.fp16_no_flatten_grads:
+                raise ValueError(
+                    "ZeRO is incomptabile with fp16 and flattened grads. "
+                    "Please use --fp16-no-flatten-grads"
+                )
+            else:
+                optim.shard_(self._optimizer, self.data_parallel_process_group)
+
+        # We should initialize the learning rate scheduler immediately after
+        # building the optimizer, so that the initial learning rate is set.
+        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
+            self.cfg.lr_scheduler,
+            self.optimizer,  # property; self._optimizer is set, so no rebuild
+        )
+        self._lr_scheduler.step_update(0)
+
+    @property
+    def is_fsdp(self):  # i.e. the run was configured with ddp_backend "fully_sharded"
+        return self.cfg.distributed_training.ddp_backend == "fully_sharded"
+
+    def consolidate_optimizer(self):
+        """Consolidate/gather optimizer state before saving (no-op if not saved)."""
+        if self.cfg.checkpoint.no_save_optimizer_state:
+            return
+        self._gathered_optim_state = None
+        inner = self.optimizer.optimizer
+        if hasattr(inner, "consolidate_state_dict"):
+            inner.consolidate_state_dict()
+        elif self.is_fsdp and not self.model.use_sharded_state:
+            # only returns on rank 0
+            gathered = self.model.gather_full_optim_state_dict(self.optimizer)
+            self._gathered_optim_state = gathered
+
+    def state_dict(self):
+        """Return the complete training state as a checkpoint-ready dict."""
+        state_dict = {
+            "args": None,  # legacy
+            "cfg": (
+                OmegaConf.to_container(self.cfg, resolve=True, enum_to_str=True)
+                if OmegaConf.is_config(self.cfg)
+                else self.cfg
+            ),
+            "model": self.model.state_dict(),
+            "criterion": (
+                self.criterion.state_dict()
+                if utils.has_parameters(self.criterion)
+                else None
+            ),
+            "optimizer_history": (self._optim_history or [])
+            + [
+                {
+                    "criterion_name": self.get_criterion().__class__.__name__,
+                    "optimizer_name": self.optimizer.__class__.__name__,
+                    "lr_scheduler_state": self.lr_scheduler.state_dict(),
+                    "num_updates": self.get_num_updates(),
+                }
+            ],
+            "task_state": self.task.state_dict() if self.task is not None else {},
+            "extra_state": {
+                "metrics": metrics.state_dict(),
+                "previous_training_time": self.cumulative_training_time(),
+            },
+        }
+        if self.cfg.ema.store_ema:  # EMA model state travels inside extra_state
+            state_dict["extra_state"]["ema"] = self.ema.get_model().state_dict()
+            if self.cfg.ema.ema_fp32:  # also keep the fp32 copy of the EMA params
+                state_dict["extra_state"]["ema_fp32_params"] = self.ema.fp32_params
+        if not self.cfg.checkpoint.no_save_optimizer_state:
+            # consolidate_optimizer() may not have run, so the attribute can be unset
+            gathered = getattr(self, "_gathered_optim_state", None)
+            self._gathered_optim_state = None  # a gathered state is used only once
+            if gathered is None:
+                gathered = self.optimizer.state_dict()
+            state_dict["last_optimizer_state"] = gathered
+        if self.is_fsdp:
+            # save meta data for recombining checkpoint upon loading
+            state_dict["fsdp_metadata"] = self.model.local_metadata_dict()
+        return state_dict
+
+    def save_checkpoint(self, filename, extra_state):
+        """Write the training state to *filename*; return its absolute path or None."""
+        if not self.should_save_checkpoint_on_current_rank:
+            return None  # this rank does not write checkpoints
+
+        logger.info(f"Saving checkpoint to {os.path.abspath(filename)}")
+        # NOTE: state_dict() is only reached on ranks that write a file.
+        cpu_state = utils.move_to_cpu(self.state_dict())
+        cpu_state["extra_state"].update(extra_state)
+
+        checkpoint_utils.torch_persistent_save(
+            cpu_state,
+            filename,
+            async_write=self.cfg.checkpoint.write_checkpoints_asynchronously,
+        )
+        logger.info(f"Finished saving checkpoint to {os.path.abspath(filename)}")
+        return os.path.abspath(filename)
+
+    def load_checkpoint(
+        self,
+        filename,
+        reset_optimizer=False,
+        reset_lr_scheduler=False,
+        optimizer_overrides=None,
+        reset_meters=False,
+    ):
+        """
+        Load all training state from a checkpoint file.
+
+        Unless every rank has to read the file itself, rank 0 loads the
+        checkpoint and broadcasts it to the other data-parallel ranks.
+        *reset_optimizer*, *reset_lr_scheduler* and *reset_meters* skip
+        restoring the respective state; *optimizer_overrides* is passed on to
+        ``optimizer.load_state_dict``. A missing file is only logged.
+
+        Returns the checkpoint's ``extra_state`` (None if nothing was loaded).
+        """
+        extra_state, self._optim_history, last_optim_state = None, [], None
+
+        logger.info(f"Preparing to load checkpoint {filename}")
+        is_distributed = self.data_parallel_world_size > 1
+        bexists = PathManager.isfile(filename)
+        if bexists:
+            load_on_all_ranks = (
+                self.cfg.checkpoint.load_checkpoint_on_all_dp_ranks
+                # TPUs don't support broadcast yet, so load checkpoints
+                # on every worker for now
+                or self.tpu
+                # FSDP requires loading checkpoint shards on all ranks
+                or (self.is_fsdp and self.cfg.distributed_training.use_sharded_state)
+                or getattr(self.cfg.model, "base_layers", 0) > 0
+            )
+
+            if load_on_all_ranks or self.data_parallel_rank == 0:
+                state = checkpoint_utils.load_checkpoint_to_cpu(
+                    filename, load_on_all_ranks=load_on_all_ranks
+                )
+                last_optim_state = state.get("last_optimizer_state", None)
+
+                # If doing zero_sharding, do not broadcast global optimizer
+                # state. Later we will broadcast sharded states to each rank
+                # to avoid memory from exploding.
+                if (
+                    not load_on_all_ranks
+                    and self.cfg.distributed_training.zero_sharding == "os"
+                    and "last_optimizer_state" in state
+                    and is_distributed
+                ):
+                    state["last_optimizer_state"] = "SHARDED"
+            else:
+                last_optim_state = None
+                state = None
+
+            if is_distributed and not load_on_all_ranks:
+                state = distributed_utils.broadcast_object(
+                    state,
+                    src_rank=0,
+                    group=self.data_parallel_process_group,
+                    dist_device=self.device,
+                )
+                if self.data_parallel_rank > 0:
+                    last_optim_state = state.get("last_optimizer_state", None)
+
+            # load model parameters
+            try:
+                if (
+                    "optimizer_history" in state
+                    and len(state["optimizer_history"]) > 0
+                    and "num_updates" in state["optimizer_history"][-1]
+                ):
+                    self.model.set_num_updates(
+                        state["optimizer_history"][-1]["num_updates"]
+                    )
+
+                # AdaPrune (https://openreview.net/forum?id=_CMSV7FTzGI): prune
+                # redundant attention heads by importance. A model fine-tuned with
+                # regularization is pruned to its top heads and saved manually;
+                # --mha-heads-to-keep prunes a freshly built model the same way so
+                # it matches that pruned checkpoint without registering a new arch.
+                if (
+                    safe_hasattr(self.model, "args")
+                    and safe_hasattr(self.model.args, "mha_heads_to_keep")
+                    and self.model.args.mha_heads_to_keep != -1
+                ):
+                    logger.info(
+                        f"Prune model: keep {self.model.args.mha_heads_to_keep} heads for each multihead attention module"
+                    )
+                    for layer in self.model.encoder.sentence_encoder.layers:
+                        reserve_head_index = layer.self_attn._get_reserve_head_index(
+                            num_heads_to_keep=self.model.args.mha_heads_to_keep
+                        )
+                        layer.self_attn._adaptive_prune_heads(
+                            reserve_head_index=reserve_head_index
+                        )
+                        layer.self_attn._set_skip_embed_dim_check()
+                    logger.info(self.model)
+                # AdaPrune for feed-forward layers (same paper and workflow as
+                # above): keep only the most important FFN units of each
+                # transformer layer. --ffn-blocks-to-remove prunes a freshly built
+                # model so it matches a manually saved pruned checkpoint without
+                # registering a new architecture.
+                if (
+                    safe_hasattr(self.model, "args")
+                    and safe_hasattr(self.model.args, "ffn_blocks_to_remove")
+                    and self.model.args.ffn_blocks_to_remove != -1
+                ):
+                    logger.info(
+                        f"Prune model: remove {self.model.args.ffn_blocks_to_remove} ffn blocks for each transformer layer"
+                    )
+                    for layer in self.model.encoder.sentence_encoder.layers:
+                        remove_index = layer._get_fc_rank(
+                            remove_num=self.model.args.ffn_blocks_to_remove
+                        )
+                        layer._prune_fc_layer(remove_index=remove_index)
+                    logger.info(self.model)
+
+                self.model.load_state_dict(
+                    state["model"], strict=True, model_cfg=self.cfg.model
+                )
+                # save memory for later steps
+                del state["model"]
+                if utils.has_parameters(self.get_criterion()):
+                    self.get_criterion().load_state_dict(
+                        state["criterion"], strict=True
+                    )
+                    del state["criterion"]
+
+            except Exception as err:
+                raise Exception(
+                    "Cannot load model parameters from checkpoint {}; "
+                    "please ensure that the architectures match.".format(filename)
+                ) from err
+            extra_state = state["extra_state"]
+            self._optim_history = state["optimizer_history"]
+
+        if last_optim_state is not None and not reset_optimizer:
+            # rebuild optimizer after loading model, since params may have changed
+            self._build_optimizer()
+
+            # only reload optimizer and lr_scheduler if they match
+            last_optim = self._optim_history[-1]
+            assert (
+                last_optim["criterion_name"] == self.get_criterion().__class__.__name__
+            ), f"Criterion does not match; please reset the optimizer (--reset-optimizer). {last_optim['criterion_name']} vs {self.get_criterion().__class__.__name__}"
+            assert (
+                last_optim["optimizer_name"] == self.optimizer.__class__.__name__
+            ), f"Optimizer does not match; please reset the optimizer (--reset-optimizer). {last_optim['optimizer_name']} vs {self.optimizer.__class__.__name__}"
+
+            if not reset_lr_scheduler:
+                self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"])
+
+            if self.is_fsdp and not self.model.use_sharded_state:
+                # if use_sharded_state, the last_optim_state is already sharded, skip this
+                last_optim_state = self.model.get_shard_from_optim_state_dict(
+                    last_optim_state
+                )
+            elif not load_on_all_ranks and is_distributed:
+                last_optim_state = self.optimizer.broadcast_global_state_dict(
+                    last_optim_state
+                )
+
+            self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)
+
+            self.set_num_updates(last_optim["num_updates"])
+
+        if extra_state is not None:
+            itr_state = extra_state["train_iterator"]
+            epoch = itr_state["epoch"]
+
+            if "previous_training_time" in extra_state:
+                self._previous_training_time = extra_state["previous_training_time"]
+                self._start_time = time.time()
+
+            self.lr_step(epoch)
+
+            if (
+                itr_state.get("version", 1) >= 2
+                and itr_state["iterations_in_epoch"] == 0
+            ):
+                # reset meters at start of epoch
+                reset_meters = True
+
+            if "metrics" in extra_state and not reset_meters:
+                metrics.load_state_dict(extra_state["metrics"])
+
+                # reset TimeMeters, since their start times don't make sense anymore
+                for meter in metrics.get_meters("default"):
+                    if isinstance(meter, meters.TimeMeter):
+                        meter.reset()
+
+            if self.cfg.ema.store_ema:
+                if "ema" not in extra_state:
+                    logger.warning(
+                        "EMA not found in checkpoint. But store_ema is True. "
+                        "EMA is re-initialized from checkpoint."
+                    )
+                    # state["model"] was deleted above; re-read the weights just loaded
+                    self.ema.restore(
+                        self.model.state_dict(),
+                        build_fp32_params=self.cfg.ema.ema_fp32,
+                    )
+                else:
+                    logger.info("Loading EMA from checkpoint")
+                    self.ema.restore(extra_state["ema"], build_fp32_params=False)
+
+                    if self.cfg.ema.ema_fp32:
+                        if "ema_fp32_params" in extra_state:
+                            logger.info("Loading EMA fp32 params from checkpoint")
+                            self.ema.build_fp32_params(extra_state["ema_fp32_params"])
+                        else:
+                            logger.info(
+                                "Building EMA fp32 params from EMA model in checkpoint"
+                            )
+                            self.ema.build_fp32_params()
+
+            logger.info(
+                "Loaded checkpoint {} (epoch {} @ {} updates)".format(
+                    filename, epoch, self.get_num_updates()
+                )
+            )
+
+        else:
+            logger.info("No existing checkpoint found {}".format(filename))
+
+        return extra_state
+
+    def get_train_iterator(
+        self,
+        epoch,
+        combine=True,
+        load_dataset=True,
+        data_selector=None,
+        shard_batch_itr=True,
+        disable_iterator_cache=False,
+    ):
+        """Return an EpochBatchIterator over the training set for a given epoch."""
+        if load_dataset:  # (re)load the train subset through the task first
+            logger.info("loading train data for epoch {}".format(epoch))
+            self.task.load_dataset(
+                self.cfg.dataset.train_subset,
+                epoch=epoch,
+                combine=combine,
+                data_selector=data_selector,
+                tpu=self.tpu,
+            )
+        batch_iterator = self.task.get_batch_iterator(
+            dataset=self.task.dataset(self.cfg.dataset.train_subset),
+            max_tokens=self.cfg.dataset.max_tokens,
+            max_sentences=self.cfg.dataset.batch_size,
+            max_positions=utils.resolve_max_positions(
+                self.task.max_positions(),
+                self.model.max_positions(),
+                self.cfg.dataset.max_tokens,
+            ),
+            ignore_invalid_inputs=True,
+            required_batch_size_multiple=self.cfg.dataset.required_batch_size_multiple,
+            seed=(self.cfg.common.seed + epoch)  # per-epoch seed only if enabled
+            if self.cfg.dataset.update_ordered_indices_seed
+            else self.cfg.common.seed,
+            num_shards=self.data_parallel_world_size if shard_batch_itr else 1,
+            shard_id=self.data_parallel_rank if shard_batch_itr else 0,
+            num_workers=self.cfg.dataset.num_workers,
+            epoch=epoch,
+            data_buffer_size=self.cfg.dataset.data_buffer_size,
+            disable_iterator_cache=disable_iterator_cache,
+            skip_remainder_batch=self.cfg.optimization.skip_remainder_batch,
+            grouped_shuffling=self.cfg.dataset.grouped_shuffling,
+            update_epoch_batch_itr=self.cfg.dataset.update_epoch_batch_itr,
+        )
+        self.reset_dummy_batch(batch_iterator.first_batch)  # becomes the dummy batch
+        return batch_iterator
+
+    def get_valid_iterator(
+        self,
+        subset,
+        disable_iterator_cache=False,
+    ):
+        """Return an EpochBatchIterator over the given validation subset."""
+        batch_iterator = self.task.get_batch_iterator(
+            dataset=self.task.dataset(subset),
+            max_tokens=self.cfg.dataset.max_tokens_valid,
+            max_sentences=self.cfg.dataset.batch_size_valid,
+            max_positions=utils.resolve_max_positions(
+                self.task.max_positions(),
+                self.model.max_positions(),
+            ),
+            ignore_invalid_inputs=self.cfg.dataset.skip_invalid_size_inputs_valid_test,
+            required_batch_size_multiple=self.cfg.dataset.required_batch_size_multiple,
+            seed=self.cfg.common.seed,
+            num_shards=self.data_parallel_world_size,
+            shard_id=self.data_parallel_rank,
+            num_workers=self.cfg.dataset.num_workers,
+            # always pass a fixed "epoch" to keep validation data consistent
+            # across training epochs
+            epoch=1,
+            data_buffer_size=self.cfg.dataset.data_buffer_size,
+            disable_iterator_cache=disable_iterator_cache,
+            skip_remainder_batch=False,
+        )
+        self.reset_dummy_batch(batch_iterator.first_batch)  # also replaces the dummy batch
+        return batch_iterator
+
+    def begin_epoch(self, epoch):
+        """Run the start-of-epoch hooks: LR schedule, quantizer, task, TPU sync."""
+        logger.info("begin training epoch {}".format(epoch))
+        self.lr_step_begin_epoch(epoch)
+
+        quantizer = self.quantizer
+        if quantizer is not None:
+            quantizer.begin_epoch(epoch)
+
+        # task specific setup per epoch
+        self.task.begin_epoch(epoch, self.get_model())
+
+        if not self.tpu:
+            return
+        import torch_xla.core.xla_model as xm
+        xm.rendezvous("begin_epoch")  # wait for all workers
+        xm.mark_step()
+
+    def begin_valid_epoch(self, epoch):
+        """Hook run at the start of each validation epoch; forwards to the task."""
+        # task specific setup per validation epoch
+        model = self.get_model()
+        self.task.begin_valid_epoch(epoch, model)
+
+    def reset_dummy_batch(self, batch):
+        self._dummy_batch = batch  # replaces the initial None set in __init__
+
+    @metrics.aggregate("train")
+    def train_step(self, samples, raise_oom=False):
+        """Do forward, backward and parameter update."""
+        self._set_seed()
+        self.model.train()
+        self.criterion.train()
+        self.zero_grad()
+
+        metrics.log_start_time("train_wall", priority=800, round=0)
+
+        # If EMA is enabled through store_ema=True
+        # and task.uses_ema is True, pass the EMA model as a keyword
+        # argument to the task.
+        extra_kwargs = {}
+        if self.cfg.ema.store_ema and getattr(self.task, "uses_ema", False):
+            extra_kwargs["ema_model"] = self.ema.get_model()
+
+        has_oom = False
+
+        # forward and backward pass
+        logging_outputs, sample_size, ooms = [], 0, 0
+        for i, sample in enumerate(samples):  # delayed update loop
+            sample, is_dummy_batch = self._prepare_sample(sample)
+
+            def maybe_no_sync():
+                """
+                Whenever *samples* contains more than one mini-batch, we
+                want to accumulate gradients locally and only call
+                all-reduce in the last backwards pass.
+                """
+                if (
+                    self.data_parallel_world_size > 1
+                    and hasattr(self.model, "no_sync")
+                    and i < len(samples) - 1
+                    # The no_sync context manager results in increased memory
+                    # usage with FSDP, since full-size gradients will be
+                    # accumulated on each GPU. It's typically a better tradeoff
+                    # to do the extra communication with FSDP.
+                    and not self.is_fsdp
+                ):
+                    return self.model.no_sync()
+                else:
+                    return contextlib.ExitStack()  # dummy contextmanager
+
+            try:
+                with maybe_no_sync():
+                    # forward and backward
+                    loss, sample_size_i, logging_output = self.task.train_step(
+                        sample=sample,
+                        model=self.model,
+                        criterion=self.criterion,
+                        optimizer=self.optimizer,
+                        update_num=self.get_num_updates(),
+                        ignore_grad=is_dummy_batch,
+                        **extra_kwargs,
+                    )
+                    del loss
+
+                logging_outputs.append(logging_output)
+                sample_size += sample_size_i
+
+                # emptying the CUDA cache after the first step can
+                # reduce the chance of OOM
+                if self.cuda and self.get_num_updates() == 0:
+                    torch.cuda.empty_cache()
+            except RuntimeError as e:
+                if "out of memory" in str(e):
+                    self._log_oom(e)
+                    has_oom = True
+                    if raise_oom:
+                        raise e
+                else:
+                    raise e
+            except Exception:
+                self.consolidate_optimizer()
+                self.save_checkpoint(
+                    os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {}
+                )
+                raise
+
+            if has_oom:
+                logger.warning(
+                    "attempting to recover from OOM in forward/backward pass"
+                )
+                ooms += 1
+                self.zero_grad()
+                if self.cuda:
+                    torch.cuda.empty_cache()
+
+                if self.cfg.distributed_training.distributed_world_size == 1:
+                    return None
+
+            if self.tpu and i < len(samples) - 1:
+                # tpu-comment: every XLA operation before marking step is
+                # appended to the IR graph, and processing too many batches
+                # before marking step can lead to OOM errors.
+                # To handle gradient accumulation use case, we explicitly
+                # mark step here for every forward pass without a backward pass
+                self._xla_markstep_and_send_to_cpu()
+
+        if is_dummy_batch:
+            if torch.is_tensor(sample_size):
+                sample_size.zero_()
+            else:
+                sample_size *= 0.0
+
+        if torch.is_tensor(sample_size):
+            sample_size = sample_size.float()
+        else:
+            sample_size = float(sample_size)
+
+        # gather logging outputs from all replicas
+        if self._sync_stats():
+            train_time = self._local_cumulative_training_time()
+            (
+                logging_outputs,
+                (
+                    sample_size,
+                    ooms,
+                    total_train_time,
+                ),
+            ) = self._aggregate_logging_outputs(
+                logging_outputs, sample_size, ooms, train_time, ignore=is_dummy_batch
+            )
+            self._cumulative_training_time = (
+                total_train_time / self.data_parallel_world_size
+            )
+
+        overflow = False
+        try:
+            with torch.autograd.profiler.record_function("reduce-grads"):
+                # reduce gradients across workers
+                self.optimizer.all_reduce_grads(self.model)
+                if utils.has_parameters(self.criterion):
+                    self.optimizer.all_reduce_grads(self.criterion)
+
+            with torch.autograd.profiler.record_function("multiply-grads"):
+                # multiply gradients by (data_parallel_size / sample_size) since
+                # DDP normalizes by the number of data parallel workers for
+                # improved fp16 precision.
+                # Thus we get (sum_of_gradients / sample_size) at the end.
+                # In case of fp16, this step also undoes loss scaling.
+                # (Debugging note: Some optimizers perform this scaling on the
+                # fly, so inspecting model.parameters() or optimizer.params may
+                # still show the original, unscaled gradients.)
+                numer = (
+                    self.data_parallel_world_size
+                    if not self.cfg.optimization.use_bmuf or self._sync_stats()
+                    else 1
+                )
+                self.optimizer.multiply_grads(numer / (sample_size or 1.0))
+                # Note: (sample_size or 1.0) handles the case of a zero gradient, in a
+                # way that avoids CPU/device transfers in case sample_size is a GPU or
+                # TPU object. The assumption is that the gradient itself is also 0.
+
+            with torch.autograd.profiler.record_function("clip-grads"):
+                # clip grads
+                grad_norm = self.clip_grad_norm(self.cfg.optimization.clip_norm)
+
+            # check that grad norms are consistent across workers
+            # on tpu check tensor is slow
+            if not self.tpu:
+                if (
+                    not self.cfg.optimization.use_bmuf
+                    and self.cfg.distributed_training.ddp_backend != "slowmo"
+                ):
+                    self._check_grad_norms(grad_norm)
+                if not torch.isfinite(grad_norm).all():
+                    # in case of AMP, if gradients are Nan/Inf then
+                    # optimizer step is still required
+                    if self.cfg.common.amp:
+                        overflow = True
+                    else:
+                        # check local gradnorm single GPU case, trigger NanDetector
+                        raise FloatingPointError("gradients are Nan/Inf")
+
+            with torch.autograd.profiler.record_function("optimizer"):
+                # take an optimization step
+                self.task.optimizer_step(
+                    self.optimizer, model=self.model, update_num=self.get_num_updates()
+                )
+                if self.cfg.common.amp and overflow:
+                    if self._amp_retries == self.cfg.common.amp_batch_retries:
+                        logger.info("AMP: skipping this batch.")
+                        self._amp_retries = 0
+                    else:
+                        self._amp_retries += 1
+                        return self.train_step(
+                            samples, raise_oom
+                        )  # recursion to feed in same batch
+
+        except FloatingPointError:
+
+            self.consolidate_optimizer()
+            self.save_checkpoint(
+                os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {}
+            )
+
+            # re-run the forward and backward pass with hooks attached to print
+            # out where it fails
+            self.zero_grad()
+            with NanDetector(self.get_model()):
+                for _, sample in enumerate(samples):
+                    sample, _ = self._prepare_sample(sample)
+                    self.task.train_step(
+                        sample,
+                        self.model,
+                        self.criterion,
+                        self.optimizer,
+                        self.get_num_updates(),
+                        ignore_grad=False,
+                        **extra_kwargs,
+                    )
+            raise
+        except OverflowError as e:
+            overflow = True
+            logger.info(
+                f"NOTE: gradient overflow detected, ignoring gradient, {str(e)}"
+            )
+
+            if hasattr(self, "param_names") and hasattr(
+                self.optimizer, "fp32_optimizer"
+            ):
+                for p, n in zip(self.optimizer.fp32_optimizer.params, self.param_names):
+                    if torch.isinf(p.grad).any() or torch.isnan(p.grad).any():
+                        logger.info(f"overflow in param {n}")
+
+            grad_norm = torch.tensor(0.0).cuda()
+            self.zero_grad()
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                self._log_oom(e)
+                logger.error("OOM during optimization, irrecoverable")
+            raise e
+
+        # Some distributed wrappers (e.g., SlowMo) need access to the optimizer
+        # after the step
+        if hasattr(self.model, "perform_slowmo"):
+            self.model.perform_slowmo(
+                self.optimizer.optimizer, getattr(self.optimizer, "fp32_params", None)
+            )
+
+        logging_output = None
+        if not overflow or self.cfg.distributed_training.ddp_backend == "slowmo":
+            self.set_num_updates(self.get_num_updates() + 1)
+
+            if self.cfg.ema.store_ema:
+                # Step EMA forward with new model.
+                self.ema.step(
+                    self.get_model(),
+                    self.get_num_updates(),
+                )
+                metrics.log_scalar(
+                    "ema_decay",
+                    self.ema.get_decay(),
+                    priority=10000,
+                    round=5,
+                    weight=0,
+                )
+
+            if self.tpu:
+                import torch_xla.core.xla_model as xm
+
+                # mark step on TPUs
+                self._xla_markstep_and_send_to_cpu()
+
+                # only log stats every log_interval steps
+                # this causes wps to be misreported when log_interval > 1
+                logging_output = {}
+                if self.get_num_updates() % self.cfg.common.log_interval == 0:
+                    # log memory usage
+                    mem_info = xm.get_memory_info(self.device)
+                    gb_free = mem_info["kb_free"] / 1024 / 1024
+                    gb_total = mem_info["kb_total"] / 1024 / 1024
+                    metrics.log_scalar(
+                        "gb_free", gb_free, priority=1500, round=1, weight=0
+                    )
+                    metrics.log_scalar(
+                        "gb_total", gb_total, priority=1600, round=1, weight=0
+                    )
+                    logging_outputs = self._xla_markstep_and_send_to_cpu(
+                        logging_outputs
+                    )
+                    logging_output = self._reduce_and_log_stats(
+                        logging_outputs, sample_size, grad_norm
+                    )
+
+                # log whenever there's an XLA compilation, since these
+                # slow down training and may indicate opportunities for
+                # optimization
+                self._check_xla_compilation()
+            else:
+                if self.cuda and self.cuda_env is not None:
+                    # log minimum free memory over the iteration
+                    gb_used = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
+                    torch.cuda.reset_peak_memory_stats()
+                    gb_free = self.cuda_env.total_memory_in_GB - gb_used
+                    metrics.log_scalar(
+                        "gb_free", gb_free, priority=1500, round=1, weight=0
+                    )
+
+                # log stats
+                logging_output = self._reduce_and_log_stats(
+                    logging_outputs, sample_size, grad_norm
+                )
+
+                # clear CUDA cache to reduce memory fragmentation
+                if (
+                    self.cuda
+                    and self.cfg.common.empty_cache_freq > 0
+                    and (
+                        (self.get_num_updates() + self.cfg.common.empty_cache_freq - 1)
+                        % self.cfg.common.empty_cache_freq
+                    )
+                    == 0
+                ):
+                    torch.cuda.empty_cache()
+
+        if self.cfg.common.fp16 or self.cfg.common.amp:
+            metrics.log_scalar(
+                "loss_scale",
+                (
+                    self.optimizer.scaler.loss_scale
+                    if self.cfg.common.fp16
+                    else self.optimizer.scaler.get_scale()
+                ),
+                priority=700,
+                round=4,
+                weight=0,
+            )
+
+        metrics.log_stop_time("train_wall")
+        return logging_output
+
+    @metrics.aggregate("valid")
+    def valid_step(self, sample, raise_oom=False):
+        """Run one forward pass in evaluation mode and return the logged stats.
+
+        Args:
+            sample: one validation batch; None or empty selects the dummy batch.
+            raise_oom: if True, re-raise an OOM error instead of retrying once.
+        """
+        if self.tpu:
+            import torch_xla.core.xla_model as xm
+
+            xm.rendezvous("valid_step")  # wait for all workers
+
+        # Pass the EMA model to the task when store_ema and task.uses_ema are set.
+        extra_kwargs = {}
+        if self.cfg.ema.store_ema and getattr(self.task, "uses_ema", False):
+            extra_kwargs["ema_model"] = self.ema.get_model()
+
+        with torch.no_grad():
+            self.model.eval()
+            self.criterion.eval()
+            sample, is_dummy_batch = self._prepare_sample(sample)
+            try:
+                _loss, sample_size, logging_output = self.task.valid_step(
+                    sample, self.model, self.criterion, **extra_kwargs
+                )
+            except RuntimeError as e:
+                if "out of memory" in str(e):
+                    self._log_oom(e)
+                    if not raise_oom:
+                        logger.warning(
+                            "ran out of memory in validation step, retrying batch"
+                        )
+                        for p in self.model.parameters():
+                            if p.grad is not None:
+                                p.grad = None  # free some memory
+                        if self.cuda:
+                            torch.cuda.empty_cache()
+                        # Retry with None for a dummy batch so the retry still
+                        # flags it as dummy; the prepared copy would look real.
+                        retry = None if is_dummy_batch else sample
+                        return self.valid_step(retry, raise_oom=True)
+                raise e
+
+            logging_outputs = [logging_output]
+            if is_dummy_batch:
+                if torch.is_tensor(sample_size):
+                    sample_size.zero_()
+                else:
+                    sample_size *= 0.0
+
+        # gather logging outputs from all replicas
+        if self.data_parallel_world_size > 1:
+            logging_outputs, (sample_size,) = self._aggregate_logging_outputs(
+                logging_outputs, sample_size, ignore=is_dummy_batch
+            )
+
+        # log validation stats
+        if self.tpu:
+            logging_outputs = self._xla_markstep_and_send_to_cpu(logging_outputs)
+        return self._reduce_and_log_stats(logging_outputs, sample_size)
+
+    def zero_grad(self):
+        self.optimizer.zero_grad()  # forwards the call to self.optimizer
+
+    def lr_step_begin_epoch(self, epoch):
+        """Notify the LR scheduler that *epoch* begins; return the updated LR."""
+        self.lr_scheduler.step_begin_epoch(epoch)
+        # prefer updating the LR based on the number of steps
+        return self.lr_step_update()
+
+    def lr_step(self, epoch, val_loss=None):
+        """Step the LR scheduler at the end of *epoch*; return the updated LR."""
+        self.lr_scheduler.step(epoch, val_loss)
+        # prefer updating the LR based on the number of steps
+        return self.lr_step_update()
+
+    def lr_step_update(self):
+        """Advance the scheduler to the current update count and log the LR(s)."""
+        lr = self.lr_scheduler.step_update(self.get_num_updates())
+        if not isinstance(lr, dict):
+            metrics.log_scalar("lr", lr, weight=0, priority=300)
+            return lr
+        for group, value in lr.items():
+            metrics.log_scalar(f"lr_{group}", value, weight=0, priority=300)
+        # return the "default" entry, or the first value when there is none
+        return lr.get("default", next(iter(lr.values())))
+
+    def get_lr(self):
+        """Return the current learning rate as reported by ``self.optimizer``."""
+        return self.optimizer.get_lr()
+
+    def get_model(self):
+        """Return ``self._model``, the (non-wrapped) model instance."""
+        return self._model
+
+    def get_criterion(self):
+        """Return ``self._criterion``, the (non-wrapped) criterion instance."""
+        return self._criterion
+
+    def get_meter(self, name):
+        """[deprecated] Return the meter registered under the legacy *name*.
+
+        Args:
+            name: legacy meter name such as "train_loss", "wall" or "wps".
+
+        Returns ``None`` when nothing matches *name*.
+        """
+        from fairseq import meters
+
+        if "get_meter" not in self._warn_once:  # warn on first use only
+            self._warn_once.add("get_meter")
+            utils.deprecation_warning(
+                "Trainer.get_meter is deprecated. Please use fairseq.metrics instead."
+            )
+
+        train_meters = metrics.get_meters("train")
+        if train_meters is None:
+            train_meters = {}
+
+        if name == "train_loss" and "loss" in train_meters:
+            return train_meters["loss"]
+
+        # Legacy train.py assumed several of these meters are always
+        # initialized, so a missing one is replaced by a fresh instance.
+        if name == "train_nll_loss":
+            return train_meters.get("nll_loss", None) or meters.AverageMeter()
+        if name == "wall":
+            return metrics.get_meter("default", "wall") or meters.TimeMeter()
+        if name == "wps":
+            return metrics.get_meter("train", "wps") or meters.TimeMeter()
+        if name in {"valid_loss", "valid_nll_loss"}:
+            stat = name[len("valid_") :]
+            return metrics.get_meter("valid", stat) or meters.AverageMeter()
+        if name == "oom":
+            return meters.AverageMeter()
+
+        if name in train_meters:
+            return train_meters[name]
+        return None
+
+    def get_num_updates(self):
+        """Return the stored number of parameter updates."""
+        return self._num_updates
+
+    def set_num_updates(self, num_updates):
+        """Set the update counter, then refresh the LR, quantizer and metric."""
+        self._num_updates = num_updates
+        self.lr_step_update()  # reads the counter stored on the previous line
+        if self.quantizer:
+            self.quantizer.step_update(self._num_updates)
+        metrics.log_scalar("num_updates", self._num_updates, weight=0, priority=200)
+
+    def clip_grad_norm(self, clip_norm):
+        """Clip gradients through the optimizer, aggregating the norm under FSDP."""
+        def _all_reduce_norm(local_norm):
+            # all-reduce the squared norm over the data-parallel group, then root it
+            squared = local_norm.cuda().float() ** 2
+            group = self.data_parallel_process_group
+            return distributed_utils.all_reduce(squared, group=group) ** 0.5
+
+        norm_fn = None
+        if self.is_fsdp:
+            if self.data_parallel_process_group is not None:
+                norm_fn = _all_reduce_norm
+            elif torch.distributed.is_initialized():
+                norm_fn = _all_reduce_norm
+        return self.optimizer.clip_grad_norm(clip_norm, aggregate_norm_fn=norm_fn)
+
+    def cumulative_training_time(self):
+        """Return the synced training time, or the local one when never synced."""
+        synced = self._cumulative_training_time
+        if synced is not None:
+            return synced
+        return self._local_cumulative_training_time()  # single GPU
+
+    def _local_cumulative_training_time(self):
+        """Return seconds since ``_start_time`` plus ``_previous_training_time``."""
+        return time.time() - self._start_time + self._previous_training_time
+
+    def _fp_convert_sample(self, sample):
+        """Cast float32 tensors in *sample* to half/bfloat16 as configured."""
+
+        def make_caster(target_dtype):
+            # build a converter that leaves non-float32 tensors untouched
+            def cast(t):
+                return t.to(dtype=target_dtype) if t.dtype is torch.float32 else t
+
+            return cast
+
+        # fp16 is applied first; tensors it converts are no longer float32
+        if self.cfg.common.fp16:
+            sample = utils.apply_to_sample(make_caster(torch.half), sample)
+
+        if self.cfg.common.bf16:
+            sample = utils.apply_to_sample(make_caster(torch.bfloat16), sample)
+
+        return sample
+
+    def _prepare_sample(self, sample, is_dummy=False):
+        """Prepare *sample* for the model; return ``(sample, used_dummy_batch)``."""
+        if sample == "DUMMY":
+            raise Exception(
+                "Trying to use an uninitialized 'dummy' batch. This usually indicates "
+                "that the total number of batches is smaller than the number of "
+                "participating GPUs. Try reducing the batch size or using fewer GPUs."
+            )
+
+        if sample is None or len(sample) == 0:
+            assert (
+                self._dummy_batch is not None and len(self._dummy_batch) > 0
+            ), "Invalid dummy batch: {}".format(self._dummy_batch)
+            sample, _ = self._prepare_sample(self._dummy_batch, is_dummy=True)
+            return sample, True
+
+        # Given that PCIe/NVLink bandwidth is significantly smaller than DRAM bandwidth
+        # it makes sense to do the format conversion on the CPU and then transfer
+        # a smaller buffer to the device. This also saves GPU memory capacity.
+        if self.cfg.common.on_cpu_convert_precision:
+            sample = self._fp_convert_sample(sample)
+
+        if self.cuda:
+            if self.pipeline_model_parallel:  # only "target" moves, to last_device
+                if "target" in sample:
+                    sample["target"] = utils.move_to_cuda(
+                        sample["target"], device=self.last_device
+                    )
+            else:
+                sample = utils.move_to_cuda(sample)
+        elif self.tpu and is_dummy:
+            # the dummy batch may not be on the appropriate device
+            sample = utils.move_to_cuda(sample, device=self.device)
+
+        if not self.cfg.common.on_cpu_convert_precision:
+            sample = self._fp_convert_sample(sample)
+
+        if self._dummy_batch == "DUMMY":  # placeholder: keep this batch as the dummy
+            self._dummy_batch = sample
+
+        return sample, False
+
+    def _set_seed(self):
+        # Seed from cfg.common.seed plus the update number so that we get
+        # reproducible results when resuming from checkpoints
+        seed = self.cfg.common.seed + self.get_num_updates()
+        utils.set_torch_seed(seed)
+
+    def _sync_stats(self):
+        """Tell whether stats should be synced across workers for this update."""
+        # A single data-parallel worker has nothing to sync.
+        if self.data_parallel_world_size == 1:
+            return False
+        # Without BMUF, multi-worker training syncs on every update.
+        if not self.cfg.optimization.use_bmuf:
+            return True
+        # BMUF: sync only on a global-sync iteration after the warmup iterations.
+        next_update = self.get_num_updates() + 1
+        if next_update % self.cfg.bmuf.global_sync_iter != 0:
+            return False
+        return next_update > self.cfg.bmuf.warmup_iterations
+
+    def _log_oom(self, exc):
+        """Log the OOM exception and a memory summary for every CUDA device."""
+        logger.warning("OOM: Ran out of memory with exception: {}".format(exc))
+        if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"):
+            for device_idx in range(torch.cuda.device_count()):
+                logger.warning(torch.cuda.memory_summary(device=device_idx))
+        sys.stderr.flush()
+
+    def _aggregate_logging_outputs(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """Sync logging outputs across workers using the path the task permits."""
+        summable = self.task.__class__.logging_outputs_can_be_summed(
+            self.get_criterion()
+        )
+        # summable outputs go through _fast_stat_sync_sum; everything else is
+        # exchanged with _all_gather_list_sync
+        sync = self._fast_stat_sync_sum if summable else self._all_gather_list_sync
+        return sync(logging_outputs, *extra_stats_to_sum, ignore=ignore)
+
+    def _all_gather_list_sync(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """
+        Gather logging outputs from all workers and sum each extra stat. Use this
+        path when logging outputs are complex types; not implemented on TPU.
+        """
+        if self.tpu:
+            raise NotImplementedError
+        if ignore:
+            logging_outputs = []  # this worker contributes no logging outputs
+        results = list(
+            zip(
+                *distributed_utils.all_gather_list(
+                    [logging_outputs] + list(extra_stats_to_sum),
+                    max_size=getattr(self.cfg.common, "all_gather_list_size", 16384),
+                    group=self.data_parallel_process_group,
+                )
+            )
+        )
+        logging_outputs, extra_stats_to_sum = results[0], results[1:]
+        logging_outputs = list(chain.from_iterable(logging_outputs))  # flatten
+        extra_stats_to_sum = [sum(s) for s in extra_stats_to_sum]  # total per stat
+        return logging_outputs, extra_stats_to_sum
+
+    def _fast_stat_sync_sum(
+        self,
+        logging_outputs: List[Dict[str, Any]],
+        *extra_stats_to_sum,
+        ignore=False,
+    ):
+        """
+        Sync logging outputs across workers by all-reducing one flat dict.
+        This is faster than all_gather_list_sync, but only suitable when
+        logging outputs are scalars and can be summed. Note that
+        *logging_outputs* cannot contain any nested dicts/lists.
+        """
+        data = {}
+        for i, stat in enumerate(extra_stats_to_sum):
+            data["extra_stats_" + str(i)] = stat
+        if len(logging_outputs) > 0:
+            log_keys = list(logging_outputs[0].keys())  # keys of the first output
+            for k in log_keys:
+                if not ignore:
+                    v = sum(log[k] for log in logging_outputs if k in log)
+                else:  # ignored batch: contribute a zero of the matching kind
+                    v = logging_outputs[0][k]
+                    v = torch.zeros_like(v) if torch.is_tensor(v) else 0
+                data["logging_outputs_" + k] = v
+        else:
+            log_keys = None
+
+        data = distributed_utils.all_reduce_dict(
+            data, device=self.device, group=self.data_parallel_process_group
+        )
+
+        extra_stats_to_sum = [
+            data["extra_stats_" + str(i)] for i in range(len(extra_stats_to_sum))
+        ]
+        if log_keys is not None:
+            logging_outputs = [{k: data["logging_outputs_" + k] for k in log_keys}]
+        else:
+            logging_outputs = []
+        return logging_outputs, extra_stats_to_sum
+
+    def _check_grad_norms(self, grad_norm):
+        """Raise FloatingPointError if grad norms are inconsistent across workers."""
+        if self._grad_norm_buf is not None:  # without a buffer the check is skipped
+            self._grad_norm_buf.zero_()
+            self._grad_norm_buf[self.data_parallel_rank] = grad_norm  # own slot
+            distributed_utils.all_reduce(
+                self._grad_norm_buf, group=self.data_parallel_process_group
+            )
+
+            def is_consistent(tensor):
+                max_abs_diff = torch.max(torch.abs(tensor - tensor[0]))
+                return (
+                    (
+                        torch.isfinite(tensor).all()
+                        and (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all()
+                    )
+                    or (self.cfg.common.amp and not torch.isfinite(tensor).all())
+                    # in case of amp non-finite grads are fine
+                )
+
+            if not is_consistent(self._grad_norm_buf):
+                pretty_detail = "\n".join(
+                    "rank {:3d} = {:.8f}".format(r, n)
+                    for r, n in enumerate(self._grad_norm_buf.tolist())
+                )
+                error_detail = "grad_norm across the workers:\n{}\n".format(
+                    pretty_detail
+                )
+                # use FloatingPointError to trigger NanDetector
+                raise FloatingPointError(
+                    "Fatal error: gradients are inconsistent between workers. "
+                    "Try --ddp-backend=legacy_ddp. "
+                    "Or are you mixing up different generation of GPUs in training?"
+                    + "\n"
+                    + "-" * 80
+                    + "\n{}\n".format(error_detail)
+                    + "-" * 80
+                )
+
+    def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None):
+        """Log grad-norm metrics, reduce *logging_outputs*, return smoothed stats."""
+        if grad_norm is not None and (
+            not torch.is_tensor(grad_norm) or torch.isfinite(grad_norm)
+        ):
+            metrics.log_speed("ups", 1.0, priority=100, round=2)
+            metrics.log_scalar("gnorm", grad_norm, priority=400, round=3)
+            if self.cfg.optimization.clip_norm > 0:
+                metrics.log_scalar(
+                    "clip",
+                    torch.where(  # NOTE(review): needs a tensor grad_norm - confirm
+                        grad_norm > self.cfg.optimization.clip_norm,
+                        grad_norm.new_tensor(100),  # norm exceeded clip_norm
+                        grad_norm.new_tensor(0),
+                    ),
+                    priority=500,
+                    round=1,
+                )
+
+        with metrics.aggregate() as agg:
+            if logging_outputs is not None:
+                self.task.reduce_metrics(logging_outputs, self.get_criterion())
+                del logging_outputs
+            # extra warning for criterions that don't properly log a loss value
+            if "loss" not in agg:
+                if "loss" not in self._warn_once:
+                    self._warn_once.add("loss")
+                    logger.warning(
+                        "Criterion.reduce_metrics did not log a 'loss' value, "
+                        "which may break some functionality"
+                    )
+                metrics.log_scalar("loss", -1)  # placeholder when no loss was logged
+
+            # support legacy interface
+            if self.tpu:
+                logging_output = {}
+            else:
+                logging_output = agg.get_smoothed_values()
+                logging_output["sample_size"] = sample_size
+                for key_to_delete in ["ppl", "wps", "wpb", "bsz"]:
+                    if key_to_delete in logging_output:
+                        del logging_output[key_to_delete]
+            return logging_output
+
+    def _check_xla_compilation(self):
+        """Warn when the first "CompileTime" metric field exceeds the stored value."""
+        import torch_xla.debug.metrics as met
+
+        compile_stats = met.metric_data("CompileTime")
+        if compile_stats is None:
+            return
+        latest = compile_stats[0]
+        if latest > self._num_xla_compiles:
+            rank = self.cfg.distributed_training.distributed_rank
+            logger.warning(
+                "XLA compilation detected on device #{}; too many of these can lead "
+                "to slow training, but we expect a few in the beginning".format(rank)
+            )
+        self._num_xla_compiles = latest
+
+    def _xla_markstep_and_send_to_cpu(self, data=None):
+        """Call xm.mark_step(); if *data* is given, return xla_device_to_cpu(data)."""
+        import torch_xla.core.xla_model as xm
+        xm.mark_step()
+        if data is None:
+            return None
+        from fairseq.utils import xla_device_to_cpu
+        return xla_device_to_cpu(data)
+
+
+def _catalog_shared_params(module, memo=None, prefix=""):
+    """Return groups of dotted names that refer to the same parameter object.
+
+    Only the outermost call (``memo is None``) returns the list; recursive
+    calls just fill *memo* and return None.
+    """
+    is_root = memo is None
+    if is_root:
+        memo = {}
+    dot = "." if prefix else ""
+    for name, param in module._parameters.items():
+        memo.setdefault(param, []).append(prefix + dot + name)
+    for name, child in module._modules.items():
+        if child is not None:
+            _catalog_shared_params(child, memo, prefix + dot + name)
+    if is_root:
+        return [names for names in memo.values() if len(names) > 1]
+    return None
+
+
+def _get_module_by_path(module, path):
+    """Resolve dotted *path* by successive getattr calls starting at *module*."""
+    for attr in path.split("."):
+        module = getattr(module, attr)
+    return module
+
+
+def _set_module_by_path(module, path, value):
+    *owners, leaf = path.split(".")  # walk to the owner, then assign the leaf
+    for attr in owners:
+        module = getattr(module, attr)
+    setattr(module, leaf, value)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
new file mode 100644
index 0000000000..4d4b350523
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
@@ -0,0 +1,951 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import collections
+import contextlib
+import copy
+import importlib
+import logging
+import os
+import sys
+import warnings
+from itertools import accumulate
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+if TYPE_CHECKING:
+    from fairseq.modules.multihead_attention import MultiheadAttention
+
+try:
+    from amp_C import multi_tensor_l2norm
+
+    multi_tensor_l2norm_available = True
+except ImportError:
+    multi_tensor_l2norm_available = False
+
+try:
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    xm = None
+
+
+logger = logging.getLogger(__name__)
+
+
+MANIFOLD_PATH_SEP = "|"
+
+
+class FileContentsAction(argparse.Action):
+    """Argparse action storing a file's stripped contents, or the raw value."""
+
+    def __init__(self, option_strings, dest, nargs=None, **kwargs):
+        if nargs is not None:
+            raise ValueError("nargs not allowed")
+        super().__init__(option_strings, dest, **kwargs)
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        from fairseq.file_io import PathManager
+        argument = values  # used as-is unless it names an existing file
+        if PathManager.isfile(values):
+            with PathManager.open(values) as f:
+                argument = f.read().strip()
+        setattr(namespace, self.dest, argument)
+
+
+def split_paths(paths: str, separator=os.pathsep) -> List[str]:
+    """Split on ``separator``; paths containing "://" split on MANIFOLD_PATH_SEP."""
+    delimiter = MANIFOLD_PATH_SEP if "://" in paths else separator
+    return paths.split(delimiter)
+
+
+def load_ensemble_for_inference(filenames, task, model_arg_overrides=None):
+    """Deprecated: warn, then call ``checkpoint_utils.load_model_ensemble``."""
+    from fairseq import checkpoint_utils
+
+    deprecation_warning(
+        "utils.load_ensemble_for_inference is deprecated. "
+        "Please use checkpoint_utils.load_model_ensemble instead."
+    )
+    load = checkpoint_utils.load_model_ensemble
+    return load(filenames, arg_overrides=model_arg_overrides, task=task)
+
+
+def apply_to_sample(f, sample):
+    """Apply ``f`` to every tensor nested in ``sample``.
+
+    Recurses into dict, OrderedDict, list, tuple and set; any other value is
+    returned unchanged. A sized ``sample`` of length 0 yields ``{}``.
+    """
+    if hasattr(sample, "__len__") and len(sample) == 0:
+        return {}
+
+    def _apply(x):
+        if torch.is_tensor(x):
+            return f(x)
+        if isinstance(x, collections.OrderedDict):
+            # OrderedDict has attributes that needs to be preserved
+            od = collections.OrderedDict((k, _apply(v)) for k, v in x.items())
+            od.__dict__ = x.__dict__
+            return od
+        if isinstance(x, dict):
+            return {k: _apply(v) for k, v in x.items()}
+        # Subclasses of list/tuple/set come back as the plain builtin type.
+        for builtin in (list, tuple, set):
+            if isinstance(x, builtin):
+                return builtin(_apply(el) for el in x)
+        return x
+
+    return _apply(sample)
+
+
+def move_to_cuda(sample, device=None):
+    """Move tensors in ``sample`` to ``device`` (current CUDA device if None)."""
+    # Test for None explicitly: device index 0 is falsy but is a valid target.
+    target = torch.cuda.current_device() if device is None else device
+    # non_blocking is ignored if tensor is not pinned, so we can always set
+    # to True (see github.com/PyTorchLightning/pytorch-lightning/issues/620)
+    return apply_to_sample(
+        lambda tensor: tensor.to(device=target, non_blocking=True), sample
+    )
+
+
+def move_to_cpu(sample):
+    """Move tensors in ``sample`` to CPU, upcasting float16/bfloat16 to float32."""
+
+    def _move_to_cpu(tensor):
+        # PyTorch has poor support for half tensors on CPU, so widen them first.
+        is_half = tensor.dtype in (torch.float16, torch.bfloat16)
+        return (tensor.to(dtype=torch.float32) if is_half else tensor).cpu()
+
+    return apply_to_sample(_move_to_cpu, sample)
+
+
+def move_to_tpu(sample):
+    """Move every tensor in ``sample`` to the device from ``xm.xla_device()``."""
+    import torch_xla.core.xla_model as xm
+
+    xla_device = xm.xla_device()
+
+    def _move_to_tpu(tensor):
+        return tensor.to(xla_device)
+
+    return apply_to_sample(_move_to_tpu, sample)
+
+
+def get_incremental_state(
+    module: "MultiheadAttention",
+    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+    key: str,
+) -> Optional[Dict[str, Optional[Tensor]]]:
+    """Return ``module.get_incremental_state(incremental_state, key)``."""
+    return module.get_incremental_state(incremental_state, key)
+
+
+def set_incremental_state(
+    module: "MultiheadAttention",
+    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+    key: str,
+    value: Dict[str, Optional[Tensor]],
+) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+    """Store ``value`` under ``key`` via the module; return the state to keep using."""
+    if incremental_state is None:
+        return None
+    updated = module.set_incremental_state(incremental_state, key, value)
+    # The module may hand back a replacement state; otherwise keep ours.
+    return incremental_state if updated is None else updated
+
+
+def load_align_dict(replace_unk):
+    """Build the alignment dictionary used for unknown-word replacement.
+
+    Returns None for None, a dict read from the file named by a non-empty
+    string (first two whitespace-separated columns of each line), else ``{}``.
+    """
+    if replace_unk is None:
+        return None
+    if not (isinstance(replace_unk, str) and replace_unk):
+        # No dictionary file given: replacement copies the original source word.
+        return {}
+    with open(replace_unk, "r") as f:
+        # A first column seen again later overrides its earlier entry.
+        rows = (line.split() for line in f)
+        return {cols[0]: cols[1] for cols in rows}
+
+
+def print_embed_overlap(embed_dict, vocab_dict):
+    """Log how many symbols of ``vocab_dict`` also appear as keys of ``embed_dict``."""
+    shared = set(embed_dict).intersection(vocab_dict.symbols)
+    message = "found {}/{} types in embedding file"
+    logger.info(message.format(len(shared), len(vocab_dict)))
+
+
+def parse_embedding(embed_path):
+    """Parse embedding text file into a dictionary of word and embedding tensors.
+
+    The first line is always skipped as a header (vocabulary size and dimension).
+    The following lines hold a word and its weights separated by spaces; runs of
+    spaces, as in the aligned example below, are treated as one separator.
+
+    Example:
+        2 5
+        the -0.0230 -0.0264  0.0287  0.0171  0.1403
+        at -0.0395 -0.1286  0.0275  0.0254 -0.0932
+    """
+    embed_dict = {}
+    with open(embed_path) as f_embed:
+        next(f_embed)  # skip header
+        for line in f_embed:
+            word, *weights = line.rstrip().split(" ")
+            # Consecutive spaces yield empty fields; float("") would raise.
+            embed_dict[word] = torch.Tensor([float(w) for w in weights if w])
+    return embed_dict
+
+
+def load_embedding(embed_dict, vocab, embedding):
+    """Copy ``embed_dict`` vectors into ``embedding.weight`` rows by vocab index."""
+    for idx, token in ((i, vocab[i]) for i in range(len(vocab))):
+        if token in embed_dict:
+            embedding.weight.data[idx] = embed_dict[token]
+    return embedding
+
+
+def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
+    """Swap each ``unk`` token of ``hypo_str`` for its aligned source token."""
+    from fairseq import tokenizer
+
+    # Tokens are strings here
+    hypo_tokens = tokenizer.tokenize_line(hypo_str)
+    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
+    src_tokens = tokenizer.tokenize_line(src_str) + ["<eos>"]
+    for i in [idx for idx, ht in enumerate(hypo_tokens) if ht == unk]:
+        src_token = src_tokens[alignment[i]]
+        # Prefer the align_dict entry; otherwise copy the source token itself.
+        hypo_tokens[i] = align_dict.get(src_token, src_token)
+    return " ".join(hypo_tokens)
+
+
+def post_process_prediction(
+    hypo_tokens,
+    src_str,
+    alignment,
+    align_dict,
+    tgt_dict,
+    remove_bpe=None,
+    extra_symbols_to_ignore=None,
+):
+    """Render ``hypo_tokens`` with ``tgt_dict``; return (tokens, string, alignment)."""
+    hypo_str = tgt_dict.string(
+        hypo_tokens, remove_bpe, extra_symbols_to_ignore=extra_symbols_to_ignore
+    )
+    if align_dict is not None:
+        unk = tgt_dict.unk_string()
+        hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, unk)
+    if not (align_dict is None and remove_bpe is None):
+        # Convert back to tokens for evaluating with unk replacement or without BPE
+        # Note that the dictionary can be modified inside the method.
+        hypo_tokens = tgt_dict.encode_line(hypo_str, add_if_not_exist=True)
+    return hypo_tokens, hypo_str, alignment
+
+
+def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
+    """Replace non-padding symbols with their position numbers.
+
+    Positions count from padding_idx+1 along dim 1; padding slots get padding_idx.
+    """
+    # The series of casts and type-conversions here are carefully
+    # balanced to both work with ONNX export and XLA. In particular XLA
+    # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
+    # how to handle the dtype kwarg in cumsum. `onnx_trace` is not read here.
+    mask = tensor.ne(padding_idx).int()
+    return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
+
+
+def strip_pad(tensor, pad):
+    return tensor[tensor.ne(pad)]  # boolean-mask indexing keeps entries != pad
+
+
+def buffered_arange(max, device="cpu"):
+    if not hasattr(buffered_arange, "buf"):  # buffer lives on the function object
+        buffered_arange.buf = torch.LongTensor().to(device)  # `device` only read here
+    if max > buffered_arange.buf.numel():  # the buffer grows, it never shrinks
+        buffered_arange.buf.resize_(max)
+        torch.arange(max, out=buffered_arange.buf)
+    return buffered_arange.buf[:max]  # a slice of the shared buffer
+
+
+def convert_padding_direction(
+    src_tokens, padding_idx, right_to_left: bool = False, left_to_right: bool = False
+):
+    """Rotate each row of ``src_tokens`` so its padding ends up on the other side.
+
+    Exactly one of ``right_to_left`` / ``left_to_right`` must be set. The input
+    tensor itself is returned when there is nothing to move.
+    """
+    assert right_to_left ^ left_to_right
+    pad_mask = src_tokens.eq(padding_idx)
+    if not pad_mask.any():
+        return src_tokens
+    already_right_padded = left_to_right and not pad_mask[:, 0].any()
+    already_left_padded = right_to_left and not pad_mask[:, -1].any()
+    if already_right_padded or already_left_padded:
+        return src_tokens
+    max_len = src_tokens.size(1)
+    positions = torch.empty(0).long()
+    if max_len > 0:
+        torch.arange(max_len, out=positions)
+    positions = positions.type_as(src_tokens).expand_as(src_tokens)
+    num_pads = pad_mask.long().sum(dim=1, keepdim=True)
+    # Shift every row by its own pad count, wrapping around modulo max_len.
+    shift = -num_pads if right_to_left else num_pads
+    return src_tokens.gather(1, torch.remainder(positions + shift, max_len))
+
+
+def item(tensor):
+    """Return ``tensor.item()``, else ``tensor[0]``, else ``tensor`` unchanged."""
+    # tpu-comment: making this a no-op for xla devices.
+    on_xla = torch.is_tensor(tensor) and tensor.device.type == "xla"
+    if on_xla:
+        return tensor.detach()
+    if hasattr(tensor, "item"):
+        return tensor.item()
+    return tensor[0] if hasattr(tensor, "__getitem__") else tensor
+
+
+def multi_tensor_total_norm(grads, chunk_size=2048 * 32) -> torch.Tensor:
+    """Return the L2 norm over all ``grads``, computed per device and combined.
+
+    CUDA groups go through ``multi_tensor_l2norm``; gradients on any other
+    device get one ``torch.norm`` each.
+    """
+    per_device_grads = {}
+    for grad in grads:
+        per_device_grads.setdefault(grad.device, []).append(grad)
+    norms = []
+    for device, cur_device_grads in per_device_grads.items():
+        if device.type != "cuda":
+            norms.extend(
+                torch.norm(g, p=2, dtype=torch.float32) for g in cur_device_grads
+            )
+            continue
+        # TODO(msb) return has_inf
+        has_inf = torch.zeros((1, 1), dtype=torch.int, device=device)
+        with torch.cuda.device(device):
+            norm = multi_tensor_l2norm(
+                chunk_size, has_inf, [cur_device_grads], False
+            )
+        norms.append(norm[0].to(torch.cuda.current_device()))
+    return torch.norm(torch.stack(norms))
+
+
+@torch.no_grad()
+def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor:
+    """Scale the gradients of ``params`` in place and return their total L2 norm.
+
+    Gradients of parameters carrying an ``expert`` attribute are scaled with
+    the rest but are left out of the norm. ``aggregate_norm_fn``, when given,
+    is applied to the norm before clipping. Clipping only happens when
+    ``max_norm > 0`` and never scales gradients up.
+    """
+    if isinstance(params, torch.Tensor):
+        params = [params]
+    params = list(params)
+    with_grad = [
+        p for p in params if p is not None and getattr(p, "grad", None) is not None
+    ]
+    grads = [p.grad.detach() for p in with_grad if not hasattr(p, "expert")]
+    expert_grads = [p.grad.detach() for p in with_grad if hasattr(p, "expert")]
+
+    if len(grads) == 0:
+        # Nothing to measure: return a zero built from the first param if any.
+        return params[0].new_tensor(0.0) if len(params) > 0 else torch.tensor(0.0)
+
+    if len(grads) == 1:
+        total_norm = torch.norm(grads[0], p=2, dtype=torch.float32)
+    elif multi_tensor_l2norm_available:
+        total_norm = multi_tensor_total_norm(grads)
+    else:
+        if torch.cuda.is_available():
+            warnings.warn(
+                "amp_C fused kernels unavailable, disabling multi_tensor_l2norm; "
+                "you may get better performance by installing NVIDIA's apex library"
+            )
+            device = torch.cuda.current_device()
+        elif grads[0].device.type == "xla":
+            device = grads[0].device
+        else:
+            device = torch.device("cpu")
+        per_grad_norms = [
+            torch.norm(g, p=2, dtype=torch.float32).to(device) for g in grads
+        ]
+        total_norm = torch.norm(torch.stack(per_grad_norms))
+
+    if aggregate_norm_fn is not None:
+        total_norm = aggregate_norm_fn(total_norm)
+
+    # clip_coef is clamped to at most 1 before the in-place multiply.
+    if max_norm > 0:
+        max_norm = float(max_norm)
+        clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1)
+        torch._foreach_mul_(grads + expert_grads, clip_coef)
+
+    return total_norm
+
+
+def fill_with_neg_inf(t):
+    """FP16-compatible -inf fill: fills ``t.float()`` and casts back with type_as."""
+    return t.float().fill_(float("-inf")).type_as(t)
+
+
+def _match_types(arg1, arg2):
+    """Broadcast a bare int/float argument to the tuple/dict shape of the other."""
+
+    def upgrade(arg_number, arg_structure):
+        if isinstance(arg_structure, tuple):
+            return (arg_number,) * len(arg_structure)
+        if isinstance(arg_structure, dict):
+            # deepcopy keeps the mapping's own type; every value is then replaced
+            arg = copy.deepcopy(arg_structure)
+            for k in arg:
+                arg[k] = upgrade(arg_number, arg_structure[k])
+            return arg
+        return arg_number
+
+    # arg1 is checked first, so two plain numbers come back unchanged.
+    if isinstance(arg1, (float, int)):
+        return upgrade(arg1, arg2), arg2
+    if isinstance(arg2, (float, int)):
+        return arg1, upgrade(arg2, arg1)
+    return arg1, arg2
+
+
+def resolve_max_positions(*args):
+    """Fold every non-None constraint in ``args`` into the tightest one.
+
+    Numbers are combined with ``min``, dicts key by key, and anything else
+    element-wise into a tuple. Returns None when no constraint is given.
+    """
+
+    def map_value_update(d1, d2):
+        merged = copy.deepcopy(d1)
+        for key, value in d2.items():
+            merged[key] = min(d1[key], value) if key in merged else value
+        return merged
+
+    def nullsafe_min(l):
+        # None entries are skipped; an all-None input yields None.
+        present = [v for v in l if v is not None]
+        return min(present) if present else None
+
+    max_positions = None
+    for arg in args:
+        if arg is None:
+            # This source imposes no constraint.
+            continue
+        if max_positions is None:
+            max_positions = arg
+            continue
+        max_positions, arg = _match_types(max_positions, arg)
+        if isinstance(arg, (float, int)):
+            max_positions = min(max_positions, arg)
+        elif isinstance(arg, dict):
+            max_positions = map_value_update(max_positions, arg)
+        else:
+            max_positions = tuple(map(nullsafe_min, zip(max_positions, arg)))
+
+    return max_positions
+
+
+def import_user_module(args):  # imports the package at args.user_dir, if it is set
+    module_path = getattr(args, "user_dir", None)
+    if module_path is not None:
+        module_path = os.path.abspath(args.user_dir)
+        if not os.path.exists(module_path) and not os.path.isfile(
+            os.path.dirname(module_path)
+        ):
+            fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir)
+            if os.path.exists(fairseq_rel_path):
+                module_path = fairseq_rel_path
+            else:
+                fairseq_rel_path = os.path.join(
+                    os.path.dirname(__file__), "..", args.user_dir
+                )
+                if os.path.exists(fairseq_rel_path):
+                    module_path = fairseq_rel_path
+                else:
+                    raise FileNotFoundError(module_path)
+
+        # ensure that user modules are only imported once
+        import_user_module.memo = getattr(import_user_module, "memo", set())
+        if module_path not in import_user_module.memo:
+            import_user_module.memo.add(module_path)
+            # The parent directory goes on sys.path; the basename is the module name.
+            module_parent, module_name = os.path.split(module_path)
+            if module_name not in sys.modules:
+                sys.path.insert(0, module_parent)
+                importlib.import_module(module_name)
+                # Then import the optional tasks/ and models/ sub-directories.
+                tasks_path = os.path.join(module_path, "tasks")
+                if os.path.exists(tasks_path):
+                    from fairseq.tasks import import_tasks
+
+                    import_tasks(tasks_path, f"{module_name}.tasks")
+
+                models_path = os.path.join(module_path, "models")
+                if os.path.exists(models_path):
+                    from fairseq.models import import_models
+
+                    import_models(models_path, f"{module_name}.models")
+            elif module_path in sys.modules[module_name].__path__:
+                logger.info(f"--user-dir={module_path} has already been imported.")
+            else:
+                raise ImportError(
+                    "Failed to import --user-dir={} because the corresponding module name "
+                    "({}) is not globally unique. Please rename the directory to "
+                    "something unique and try again.".format(module_path, module_name)
+                )
+
+
+def softmax(x, dim: int, onnx_trace: bool = False):
+    """Softmax over ``dim`` in float32; under ONNX tracing the input is cast first."""
+    if not onnx_trace:
+        return F.softmax(x, dim=dim, dtype=torch.float32)
+    return F.softmax(x.float(), dim=dim)
+
+
+def log_softmax(x, dim: int, onnx_trace: bool = False):
+    """Log-softmax over ``dim`` in float32; ONNX tracing casts the input first."""
+    if not onnx_trace:
+        return F.log_softmax(x, dim=dim, dtype=torch.float32)
+    return F.log_softmax(x.float(), dim=dim)
+
+
+def get_perplexity(loss, round=2, base=2):
+    from fairseq.logging.meters import safe_round
+    # base**loss passed through safe_round; 0.0 for a None loss, inf on overflow.
+    if loss is None:
+        return 0.0
+    try:
+        return safe_round(base**loss, round)
+    except OverflowError:
+        return float("inf")
+
+
+def deprecation_warning(message, stacklevel=3):
+    # Uses the default warning category: DeprecationWarning is ignored by default.
+    warnings.warn(message, stacklevel=stacklevel)
+
+
+def relu_squared(x: torch.Tensor):
+    return torch.square(F.relu(x))  # element-wise relu(x) ** 2
+
+
+def get_activation_fn(activation: str) -> Callable:
+    """Return the callable registered under the name ``activation``.
+
+    "gelu_fast" emits a deprecation warning and maps to gelu_accurate; an
+    unknown name raises RuntimeError.
+    """
+    from fairseq.modules import gelu, gelu_accurate
+
+    if activation == "gelu_fast":
+        deprecation_warning(
+            "--activation-fn=gelu_fast has been renamed to gelu_accurate"
+        )
+        return gelu_accurate
+    by_name = {
+        "relu": F.relu,
+        "relu_squared": relu_squared,
+        "gelu": gelu,
+        "gelu_accurate": gelu_accurate,
+        "tanh": torch.tanh,
+        "linear": lambda x: x,
+        "swish": torch.nn.SiLU,  # NOTE(review): the class, not an instance
+    }
+    if activation not in by_name:
+        raise RuntimeError("--activation-fn {} not supported".format(activation))
+    return by_name[activation]
+
+
+def get_available_activation_fns() -> List:
+    return [
+        "relu",
+        "gelu",
+        "gelu_fast",  # deprecated
+        "gelu_accurate",
+        "tanh",
+        "linear",
+    ]  # NOTE(review): get_activation_fn also accepts "relu_squared" and "swish"
+
+
+@contextlib.contextmanager
+def model_eval(model):  # eval mode inside the block, previous mode restored after
+    with contextlib.ExitStack() as restore:
+        restore.callback(model.train, model.training)  # runs even if body raises
+        model.eval()
+        yield
+
+
+def has_parameters(module):
+    """Return True when ``module.parameters()`` yields at least one item."""
+    # A private sentinel distinguishes "exhausted" from any yielded value.
+    missing = object()
+    first = next(module.parameters(), missing)
+    return first is not missing
+
+
+def get_rng_state():  # returns a dict; XLA/CUDA entries only when available
+    state = {"torch_rng_state": torch.get_rng_state()}
+    if xm is not None:
+        state["xla_rng_state"] = xm.get_rng_state()
+    if torch.cuda.is_available():
+        state["cuda_rng_state"] = torch.cuda.get_rng_state()
+    return state
+
+
+def set_rng_state(state):  # expects the dict layout produced by get_rng_state
+    torch.set_rng_state(state["torch_rng_state"])
+    if xm is not None:
+        xm.set_rng_state(state["xla_rng_state"])  # KeyError if the key is absent
+    if torch.cuda.is_available():
+        torch.cuda.set_rng_state(state["cuda_rng_state"])  # likewise
+
+
+class set_torch_seed(object):
+    def __init__(self, seed):
+        assert isinstance(seed, int)
+        self.rng_state = get_rng_state()
+        # Seeding happens here, at construction time, not in __enter__.
+        torch.manual_seed(seed)
+        if xm is not None:
+            xm.set_rng_state(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        set_rng_state(self.rng_state)  # restore the state captured in __init__
+
+
+def parse_alignment(line):
+    """
+    Parses a single line from the alignment file.
+
+    Args:
+        line (str): String containing the alignment of the format:
+            <src_idx_1>-<tgt_idx_1> <src_idx_2>-<tgt_idx_2> ..
+            <src_idx_m>-<tgt_idx_m>. All indices are 0 indexed.
+
+    Returns:
+        torch.IntTensor: packed alignments of shape (2 * m).
+
+    Raises:
+        ValueError: if a pair is not of the form ``<int>-<int>``.
+    """
+    pairs = [pair.split("-") for pair in line.strip().split()]
+    # Flatten to [src_1, tgt_1, src_2, tgt_2, ...], the packed layout above.
+    flat = [int(idx) for src_idx, tgt_idx in pairs for idx in (src_idx, tgt_idx)]
+    return torch.IntTensor(flat)
+
+
+def get_token_to_word_mapping(tokens, exclude_list):
+    """Map token index -> running count of tokens not in ``exclude_list`` so far."""
+    # Each non-excluded token bumps the running count by one.
+    starts_word = (int(token not in exclude_list) for token in tokens)
+    word_idx = accumulate(starts_word)
+    return dict(enumerate(word_idx))
+
+
+def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos):
+    """Pair each non-pad/eos target token with its highest-attention source token.
+
+    Returns a list of ``(src_word, tgt_word)`` index pairs.
+    """
+    special = [eos, pad]
+    tgt_keep = (tgt_sent != pad) & (tgt_sent != eos)
+    src_drop = (src_sent == pad) | (src_sent == eos)
+    tgt_valid = tgt_keep.nonzero(as_tuple=False).squeeze(dim=-1)
+    src_invalid = src_drop.nonzero(as_tuple=False).squeeze(dim=-1)
+    src_token_to_word = get_token_to_word_mapping(src_sent, special)
+    tgt_token_to_word = get_token_to_word_mapping(tgt_sent, special)
+    if len(tgt_valid) == 0 or len(src_invalid) >= len(src_sent):
+        return []
+    # Pad/eos source columns are set to -inf so max() does not select them.
+    attn_valid = attn[tgt_valid]
+    attn_valid[:, src_invalid] = float("-inf")
+    _, src_indices = attn_valid.max(dim=1)
+    return [
+        (src_token_to_word[s.item()] - 1, tgt_token_to_word[t.item()] - 1)
+        for t, s in zip(tgt_valid, src_indices)
+    ]
+
+
+def extract_soft_alignment(attn, src_sent, tgt_sent, pad, eos):
+    """Format attention over non-pad source tokens for each non-pad target token."""
+    # ``eos`` is accepted but not used here.
+    # tgt_valid keeps nonzero()'s trailing dim; only src_valid is squeezed.
+    tgt_valid = (tgt_sent != pad).nonzero(as_tuple=False)
+    src_valid = (src_sent != pad).nonzero(as_tuple=False).squeeze(dim=-1)
+    if len(tgt_valid) == 0 or len(src_valid) == 0:
+        return []
+    attn_valid = attn[tgt_valid, src_valid]
+    return [["{:.6f}".format(p) for p in row] for row in attn_valid.tolist()]
+
+
+def new_arange(x, *size):
+    """Return ``arange(size[-1])`` on the device of ``x``, expanded to ``size``.
+
+    With no ``size`` given, the size of ``x`` is used.
+    """
+    shape = size if size else x.size()
+    last_dim = torch.arange(shape[-1], device=x.device)
+    return last_dim.expand(*shape).contiguous()
+
+
+def get_tpu_device():
+    return xm.xla_device()  # module-level xm, which is None if torch_xla is missing
+
+
+def tpu_data_loader(itr):
+    """Wrap ``itr`` in an XLA ParallelLoader inside a CountingIterator."""
+    import torch_xla.core.xla_model as xm
+    import torch_xla.distributed.parallel_loader as pl
+
+    from fairseq.data import iterators
+
+    xm.rendezvous("tpu_data_loader")  # wait for all workers
+    xm.mark_step()
+    device = xm.xla_device()
+    per_device = pl.ParallelLoader(itr, [device]).per_device_loader(device)
+    # Start counting from ``itr.n`` when the wrapped iterator exposes it.
+    start = getattr(itr, "n", 0)
+    return iterators.CountingIterator(per_device, start=start, total=len(itr))
+
+
+def is_xla_tensor(tensor):
+    return torch.is_tensor(tensor) and tensor.device.type == "xla"  # False for non-tensors
+
+
+def index_put(tensor, indices, value):
+    """Set ``tensor[indices] = value``; XLA tensors get an arithmetic blend instead."""
+    if not is_xla_tensor(tensor):
+        tensor[indices] = value
+        return tensor
+    for _ in range(indices.dim(), tensor.dim()):
+        indices = indices.unsqueeze(-1)
+    if indices.size(-1) < tensor.size(-1):
+        indices = indices.expand_as(tensor)
+    return torch.mul(tensor, ~indices) + torch.mul(value, indices)
+
+
+def xla_device_to_cpu(dat):
+    import torch_xla.core.xla_model as xm
+    # Delegates to torch_xla's underscore-prefixed helper _maybe_convert_to_cpu.
+    return xm._maybe_convert_to_cpu(dat)
+
+
+class CudaEnvironment(object):
+    """Name, compute capability and total memory of the current CUDA device."""
+
+    def __init__(self):
+        # Queried once here; later device changes are not reflected.
+        device_name = "cuda:{}".format(torch.cuda.current_device())
+        prop = torch.cuda.get_device_properties(device_name)
+        self.name, self.major, self.minor = prop.name, prop.major, prop.minor
+        self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024
+
+    @staticmethod
+    def pretty_print_cuda_env_list(cuda_env_list):
+        """
+        Given a list of CudaEnvironments, pretty print them
+        """
+        center = "CUDA enviroments for all {} workers".format(len(cuda_env_list))
+        banner = "*" * (40 - len(center) // 2)
+        first_line = banner + center + banner
+        logger.info(first_line)
+        for r, env in enumerate(cuda_env_list):
+            logger.info(
+                "rank {:3d}: ".format(r)
+                + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor)
+                + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB)
+                + "name = {:40s}".format(env.name)
+            )
+        logger.info(first_line)
+
+
+def csv_str_list(x):
+    return x.split(",")  # no whitespace stripping: "a, b" -> ["a", " b"]
+
+
+def eval_str_list(x, type=float):
+    if x is None:
+        return None
+    if isinstance(x, str):
+        x = eval(x)  # NOTE(review): eval runs arbitrary code; x must be trusted
+    try:
+        return list(map(type, x))
+    except TypeError:
+        return [type(x)]  # e.g. x was not iterable: treat it as a single value
+
+
+def eval_str_dict(x, type=dict):  # `type` is accepted but never read
+    if x is None:
+        return None
+    if isinstance(x, str):
+        x = eval(x)  # NOTE(review): eval runs arbitrary code; x must be trusted
+    return x
+
+
+def eval_bool(x, default=False):
+    if x is None:
+        return default
+    try:
+        return bool(eval(x))  # NOTE(review): eval runs arbitrary code; trust x
+    except TypeError:
+        return default  # e.g. eval() rejects a non-string x such as True
+
+
+def reset_logging():
+    """Reset the root logger to one stdout handler at the LOGLEVEL (default INFO)."""
+    root = logging.getLogger()
+    # Iterate over a copy: removeHandler mutates root.handlers, and removing
+    # from the live list while looping over it skips every other handler.
+    for handler in list(root.handlers):
+        root.removeHandler(handler)
+    root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper())
+    handler = logging.StreamHandler(sys.stdout)
+    fmt = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+    datefmt = "%Y-%m-%d %H:%M:%S"
+    handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))
+    root.addHandler(handler)
+
+
+def safe_getattr(obj, k, default=None):
+    """Config: obj[k] unless missing or None, else default. Other objects: getattr."""
+    from omegaconf import OmegaConf
+
+    if OmegaConf.is_config(obj):
+        return obj[k] if k in obj and obj[k] is not None else default
+    # NOTE: unlike the config branch, an attribute set to None is returned as None.
+    return getattr(obj, k, default)
+
+
+def safe_hasattr(obj, k):
+    """Returns True if attribute ``k`` of ``obj`` exists and is not None."""
+    return getattr(obj, k, None) is not None
+
+
+def hotreload_function(name=None):
+    """
+    Decorator to function to enable hot-reload for debugging.
+    It allows you to debug a function without having to reload all heavy models, dataset loading and
+        preprocessing, allowing faster debugging.
+    If you want to change model or dataset loading, consider relaunching your code
+    -----------------------------------
+    This will run the decorated function func:
+        if func run successful:
+            It will pause, allow user to edit code, and prompt user to:
+                Press enter to re-run the function with updated code
+                Type "done" to finish the function, return output
+                Type "disable" to stop pausing this function and let code continue without pause
+                Ctrl + C to terminate
+        if func raise error:
+            it will prompt user to
+                1. Edit code, and press enter to retry
+                2. Ctrl + C to terminate
+                3. Type "raise" to raise that exception
+    * Requirements:
+        0. Fairseq was installed with `pip install --editable .`
+        1. pip install jurigged[develoop]
+        2. set environment HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1
+        3. Run on only 1 GPU (no distributed)
+    * How to use:
+        1. in python, import and decorate the top-level function to be re-run after code edits:
+            ```python
+            from fairseq.utils import hotreload_function
+            ....
+            @hotreload_function("train_step")
+            def train_step(self, sample ....):
+                ....
+            ....
+            ```
+        2. in bash run scripts:
+            ```bash
+            watch_dir=<home>/fairseq-py/fairseq/tasks # directory to watch for file changes
+            export CUDA_VISIBLE_DEVICES=0 # single-gpu
+            HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1 python -m jurigged -w ${watch_dir} --poll 2 -v train.py ......
+            ```
+    * NOTE:
+        1. -w ${watch_dir} specify all the files to be watched for changes
+            once functions, class, ... code are changed, all instances in the process will get updated (hot-reload)
+    * Limitation:
+        * Currently distributed debugging not working
+        * Need to launch train.py locally (cannot submit jobs)
+    """
+    try:
+        import jurigged
+    except ImportError as e:
+        logger.warning("Please install jurigged: pip install jurigged[develoop]")
+        raise e
+    from fairseq.distributed import utils as distributed_utils
+    import traceback
+
+    def hotreload_decorator(func):
+        assert callable(func), f"not callable: {func}"
+        jname = name or func.__name__
+        logger.info(f"jurigged-hotreload:Apply jurigged on {jname}:{func.__name__}")
+        HOTRELOAD_PAUSE = bool(os.environ.get("HOTRELOAD_PAUSE", 0))  # NOTE(review): any non-empty value, even "0", is truthy
+        cublk = bool(os.environ.get("CUDA_LAUNCH_BLOCKING", 0))
+        prefix = f"HOTRELOAD:{jname}:[cublk={cublk}]"
+        hot_reload_state = {"disable": False}
+
+        def func_wrapper(*args, **kwargs):
+            if not HOTRELOAD_PAUSE or hot_reload_state["disable"]:
+                return func(*args, **kwargs)
+            world_size = distributed_utils.get_global_world_size()
+            assert (
+                world_size <= 1
+            ), f"HOTRELOAD_PAUSE:{jname} currently cannot do distributed training"
+            success = False
+            while not success:
+                try:
+                    output = func(*args, **kwargs)
+                    # success = True
+                    end_action = input(
+                        f"{prefix}: PAUSE, you may edit code now. Enter to re-run, ctrl+C to terminate, "
+                        f'type "done" to continue (function still being watched), or type "disable" to stop pausing this function :'
+                    )
+                    if end_action.strip().lower() in ["disable", "done"]:
+                        success = True
+                    else:
+                        logger.warning(
+                            f"{prefix}: action={end_action} function will re-run now."
+                        )
+                except Exception as e:
+                    action = input(
+                        f"{prefix}:ERROR: \n{traceback.format_exc()}\n"
+                        f'Edit code to try again: enter to continue, ctrl+C to terminate, or type "raise" to raise the exception: '
+                    )
+                    if action.strip().lower() == "raise":
+                        raise e
+
+            if end_action.strip().lower() == "disable":
+                logger.warning(
+                    f"{prefix}: Stop pausing {jname}. The function is still being watched and newly editted code will take effect "
+                    f"if the {jname} is called again later."
+                    f' "unset HOTRELOAD_PAUSE" before relaunch to disable hotreload and'
+                    f" remove @hotreload_function decorator in the code."
+                )
+                hot_reload_state["disable"] = True
+            return output
+
+        return func_wrapper
+
+    return hotreload_decorator
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/version.txt b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/version.txt
new file mode 100644
index 0000000000..26acbf080b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/version.txt
@@ -0,0 +1 @@
+0.12.2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
new file mode 100644
index 0000000000..dbd1450a9e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Evaluate the perplexity of a trained language model.
+"""
+
+import logging
+import math
+import os
+import sys
+from argparse import Namespace
+from typing import Iterable, List, Optional
+
+import torch
+from omegaconf import DictConfig
+
+import fairseq
+from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.logging import progress_bar
+from fairseq.logging.meters import StopwatchMeter
+from fairseq.sequence_scorer import SequenceScorer
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),
+    stream=sys.stdout,
+)
+logger = logging.getLogger("fairseq_cli.eval_lm")
+
+
+def eval_lm(
+    models: List[fairseq.models.FairseqModel],
+    source_dictionary: fairseq.data.Dictionary,
+    batch_iterator: Iterable,
+    post_process: Optional[str] = None,
+    output_word_probs: bool = False,
+    output_word_stats: bool = False,
+    target_dictionary: Optional[fairseq.data.Dictionary] = None,
+    softmax_batch: int = 0,
+    remove_bos_token: bool = False,
+    device: Optional[torch.device] = None,
+):
+    """
+    Args:
+        models (List[~fairseq.models.FairseqModel]): list of models to
+            evaluate. Models are essentially `nn.Module` instances, but
+            must be compatible with fairseq's `SequenceScorer`.
+        source_dictionary (~fairseq.data.Dictionary): dictionary for
+            applying any relevant post processing or outputting word
+            probs/stats.
+        batch_iterator (Iterable): yield batches of data
+        post_process (Optional[str]): post-process text by removing BPE,
+            letter segmentation, etc. Valid options can be found in
+            fairseq.data.utils.post_process, although not all options
+            are implemented here.
+        output_word_probs (Optional[bool]): output words and their
+            predicted log probabilities
+        output_word_stats (Optional[bool]): output word statistics such
+            as word count and average probability
+        target_dictionary (Optional[~fairseq.data.Dictionary]): output
+            dictionary (defaults to *source_dictionary*)
+        softmax_batch (Optional[int]): if BxT is more than this, will
+            batch the softmax over vocab to this amount of tokens, in
+            order to fit into GPU memory
+        remove_bos_token (Optional[bool]): if True, confirm that the
+            first token is the beginning-of-sentence symbol (according
+            to the relevant dictionary) and remove it from the output
+        device (Optional[torch.device]): device to use for evaluation
+            (defaults to device of first model parameter)
+    """
+    if target_dictionary is None:
+        target_dictionary = source_dictionary
+    if device is None:
+        device = next(models[0].parameters()).device
+
+    gen_timer = StopwatchMeter()
+    scorer = SequenceScorer(target_dictionary, softmax_batch)
+
+    score_sum = 0.0
+    count = 0
+
+    if post_process is not None:
+        if post_process in {"subword_nmt", "@@ "}:
+            bpe_cont = post_process.rstrip()
+            bpe_toks = {
+                i
+                for i in range(len(source_dictionary))
+                if source_dictionary[i].endswith(bpe_cont)
+            }
+        else:
+            raise NotImplementedError(
+                f"--post-process={post_process} is not implemented"
+            )
+        bpe_len = len(bpe_cont)
+    else:
+        bpe_toks = None
+        bpe_len = 0
+
+    word_stats = dict()
+
+    for sample in batch_iterator:
+        if "net_input" not in sample:
+            continue
+
+        sample = utils.move_to_cuda(sample, device=device)
+
+        gen_timer.start()
+        hypos = scorer.generate(models, sample)
+        gen_timer.stop(sample["ntokens"])
+
+        for i, hypos_i in enumerate(hypos):
+            hypo = hypos_i[0]
+            sample_id = sample["id"][i]
+
+            tokens = hypo["tokens"]
+            tgt_len = tokens.numel()
+            pos_scores = hypo["positional_scores"].float()
+
+            if remove_bos_token:
+                assert hypo["tokens"][0].item() == target_dictionary.bos()
+                tokens = tokens[1:]
+                pos_scores = pos_scores[1:]
+
+            skipped_toks = 0
+            if bpe_toks is not None:
+                for k in range(len(tokens) - 1):  # not tgt_len: BOS may be stripped
+                    if tokens[k].item() in bpe_toks:
+                        skipped_toks += 1
+                        pos_scores[k + 1] += pos_scores[k]
+                        pos_scores[k] = 0
+
+            inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf"))
+            if inf_scores.any():
+                logger.info(
+                    "skipping tokens with inf scores: %s",
+                    target_dictionary.string(tokens[inf_scores.nonzero()]),
+                )
+                pos_scores = pos_scores[(~inf_scores).nonzero()]
+            score_sum += pos_scores.sum().cpu()
+            count += pos_scores.numel() - skipped_toks
+
+            if output_word_probs or output_word_stats:
+                w = ""
+                word_prob = []
+                is_bpe = False
+                for i in range(len(tokens)):
+                    w_ind = tokens[i].item()
+                    w += source_dictionary[w_ind]
+                    if bpe_toks is not None and w_ind in bpe_toks:
+                        w = w[:-bpe_len]
+                        is_bpe = True
+                    else:
+                        word_prob.append((w, pos_scores[i].item()))
+
+                        next_prob = None
+                        ind = i + 1
+                        while ind < len(tokens):
+                            if pos_scores[ind].item() != 0:
+                                next_prob = pos_scores[ind]
+                                break
+                            ind += 1
+
+                        word_stats.setdefault(w, WordStat(w, is_bpe)).add(
+                            pos_scores[i].item(), next_prob
+                        )
+                        is_bpe = False
+                        w = ""
+                if output_word_probs:
+                    logger.info(
+                        str(int(sample_id))
+                        + " "
+                        + (
+                            "\t".join(
+                                "{} [{:2f}]".format(x[0], x[1]) for x in word_prob
+                            )
+                        )
+                    )
+
+    avg_nll_loss = (
+        -score_sum / count / math.log(2) if count > 0 else 0
+    )  # convert to base 2
+    logger.info(
+        "Evaluated {:,} tokens in {:.1f}s ({:.2f} tokens/s)".format(
+            gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg if gen_timer.avg > 0 else 0
+        )
+    )
+
+    if output_word_stats:
+        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
+            logger.info(ws)
+
+    return {
+        "loss": avg_nll_loss,
+        "perplexity": 2**avg_nll_loss,
+    }
+
+
+class WordStat(object):
+    def __init__(self, word, is_bpe):
+        self.word = word
+        self.is_bpe = is_bpe
+        self.log_prob = 0
+        self.next_word_prob = 0
+        self.count = 0
+        self.missing_next_words = 0
+
+    def add(self, log_prob, next_word_prob):
+        """increments counters for the sum of log probs of current word and next
+        word (given context ending at current word). Since the next word might be at the end of the example,
+        or it might be not counted because it is not an ending subword unit,
+        also keeps track of how many of those we have seen"""
+        if next_word_prob is not None:
+            self.next_word_prob += next_word_prob
+        else:
+            self.missing_next_words += 1
+        self.log_prob += log_prob
+        self.count += 1
+
+    def __str__(self):
+        return "{}\t{}\t{}\t{}\t{}\t{}".format(
+            self.word,
+            self.count,
+            self.log_prob,
+            self.is_bpe,
+            self.next_word_prob,
+            self.count - self.missing_next_words,
+        )
+
+
+def main(cfg: DictConfig, **unused_kwargs):
+    if isinstance(cfg, Namespace):
+        cfg = convert_namespace_to_omegaconf(cfg)
+
+    utils.import_user_module(cfg.common)
+
+    logger.info(cfg)
+
+    if cfg.eval_lm.context_window > 0:
+        # reduce tokens per sample by the required context window size
+        cfg.task.tokens_per_sample -= cfg.eval_lm.context_window
+
+    # Initialize the task using the current *cfg*
+    task = tasks.setup_task(cfg.task)
+
+    # Load ensemble
+    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
+    models, model_args, task = checkpoint_utils.load_model_ensemble_and_task(
+        [cfg.common_eval.path],
+        arg_overrides=eval(cfg.common_eval.model_overrides),
+        suffix=cfg.checkpoint.checkpoint_suffix,
+        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
+        num_shards=cfg.checkpoint.checkpoint_shard_count,
+        task=task,
+    )
+
+    use_fp16 = cfg.common.fp16
+    use_cuda = torch.cuda.is_available() and not cfg.common.cpu
+    if use_cuda:
+        torch.cuda.set_device(cfg.distributed_training.device_id)
+
+    # Optimize ensemble for generation and set the source and dest dicts on the model
+    # (required by scorer)
+    for model in models:
+        if use_fp16:
+            model.half()
+        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
+            model.cuda()
+        model.prepare_for_inference_(cfg)
+
+    assert len(models) > 0
+
+    logger.info(
+        "num. model params: {:,}".format(sum(p.numel() for p in models[0].parameters()))
+    )
+
+    # Load dataset splits
+    task.load_dataset(cfg.dataset.gen_subset)
+    dataset = task.dataset(cfg.dataset.gen_subset)
+    logger.info(
+        "{} {} {:,} examples".format(
+            cfg.task.data, cfg.dataset.gen_subset, len(dataset)
+        )
+    )
+
+    itr = task.eval_lm_dataloader(
+        dataset=dataset,
+        max_tokens=cfg.dataset.max_tokens or 36000,
+        batch_size=cfg.dataset.batch_size,
+        max_positions=utils.resolve_max_positions(
+            *[model.max_positions() for model in models]
+        ),
+        num_shards=max(
+            cfg.dataset.num_shards,
+            cfg.distributed_training.distributed_world_size,
+        ),
+        shard_id=max(
+            cfg.dataset.shard_id,
+            cfg.distributed_training.distributed_rank,
+        ),
+        num_workers=cfg.dataset.num_workers,
+        data_buffer_size=cfg.dataset.data_buffer_size,
+        context_window=cfg.eval_lm.context_window,
+    )
+
+    itr = progress_bar.progress_bar(
+        itr,
+        log_format=cfg.common.log_format,
+        log_interval=cfg.common.log_interval,
+        default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+    )
+
+    results = eval_lm(
+        models=models,
+        source_dictionary=task.source_dictionary,
+        batch_iterator=itr,
+        post_process=cfg.common_eval.post_process,
+        output_word_probs=cfg.eval_lm.output_word_probs,
+        output_word_stats=cfg.eval_lm.output_word_stats,
+        target_dictionary=task.target_dictionary,
+        softmax_batch=cfg.eval_lm.softmax_batch,
+        remove_bos_token=getattr(cfg.task, "add_bos_token", False),
+    )
+
+    logger.info(
+        "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format(
+            results["loss"], results["perplexity"]
+        )
+    )
+
+    return results
+
+
+def cli_main():
+    parser = options.get_eval_lm_parser()
+    args = options.parse_args_and_arch(parser)
+
+    distributed_utils.call_main(convert_namespace_to_omegaconf(args), main)
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
new file mode 100644
index 0000000000..b8757835d4
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
@@ -0,0 +1,417 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Translate pre-processed data with a trained model.
+"""
+
+import ast
+import logging
+import math
+import os
+import sys
+from argparse import Namespace
+from itertools import chain
+
+import numpy as np
+import torch
+from omegaconf import DictConfig
+
+from fairseq import checkpoint_utils, options, scoring, tasks, utils
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.logging import progress_bar
+from fairseq.logging.meters import StopwatchMeter, TimeMeter
+
+
+def main(cfg: DictConfig):
+
+    if isinstance(cfg, Namespace):
+        cfg = convert_namespace_to_omegaconf(cfg)
+
+    assert cfg.common_eval.path is not None, "--path required for generation!"
+    assert (
+        not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam
+    ), "--sampling requires --nbest to be equal to --beam"
+    assert (
+        cfg.generation.replace_unk is None or cfg.dataset.dataset_impl == "raw"
+    ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)"
+
+    if cfg.common_eval.results_path is not None:
+        os.makedirs(cfg.common_eval.results_path, exist_ok=True)
+        output_path = os.path.join(
+            cfg.common_eval.results_path,
+            "generate-{}.txt".format(cfg.dataset.gen_subset),
+        )
+        with open(output_path, "w", buffering=1, encoding="utf-8") as h:
+            return _main(cfg, h)
+    else:
+        return _main(cfg, sys.stdout)
+
+
+def get_symbols_to_strip_from_output(generator):
+    if hasattr(generator, "symbols_to_strip_from_output"):
+        return generator.symbols_to_strip_from_output
+    else:
+        return {generator.eos}
+
+
+def _main(cfg: DictConfig, output_file):
+    logging.basicConfig(
+        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=os.environ.get("LOGLEVEL", "INFO").upper(),
+        stream=output_file,
+    )
+    logger = logging.getLogger("fairseq_cli.generate")
+
+    utils.import_user_module(cfg.common)
+
+    if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None:
+        cfg.dataset.max_tokens = 12000
+    logger.info(cfg)
+
+    # Fix seed for stochastic decoding
+    if cfg.common.seed is not None and not cfg.generation.no_seed_provided:
+        np.random.seed(cfg.common.seed)
+        utils.set_torch_seed(cfg.common.seed)
+
+    use_cuda = torch.cuda.is_available() and not cfg.common.cpu
+
+    # Load dataset splits
+    task = tasks.setup_task(cfg.task)
+
+    # Set dictionaries
+    try:
+        src_dict = getattr(task, "source_dictionary", None)
+    except NotImplementedError:
+        src_dict = None
+    tgt_dict = task.target_dictionary
+
+    overrides = ast.literal_eval(cfg.common_eval.model_overrides)
+
+    # Load ensemble
+    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
+    models, saved_cfg = checkpoint_utils.load_model_ensemble(
+        utils.split_paths(cfg.common_eval.path),
+        arg_overrides=overrides,
+        task=task,
+        suffix=cfg.checkpoint.checkpoint_suffix,
+        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
+        num_shards=cfg.checkpoint.checkpoint_shard_count,
+    )
+
+    # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config
+    task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task)
+
+    if cfg.generation.lm_path is not None:
+        overrides["data"] = cfg.task.data
+
+        try:
+            lms, _ = checkpoint_utils.load_model_ensemble(
+                [cfg.generation.lm_path], arg_overrides=overrides, task=None
+            )
+        except:
+            logger.warning(
+                f"Failed to load language model! Please make sure that the language model dict is the same "
+                f"as target dict and is located in the data dir ({cfg.task.data})"
+            )
+            raise
+
+        assert len(lms) == 1
+    else:
+        lms = [None]
+
+    # Optimize ensemble for generation
+    for model in chain(models, lms):
+        if model is None:
+            continue
+        if cfg.common.fp16:
+            model.half()
+        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
+            model.cuda()
+        model.prepare_for_inference_(cfg)
+
+    # Load alignment dictionary for unknown word replacement
+    # (None if no unknown word replacement, empty if no path to align dictionary)
+    align_dict = utils.load_align_dict(cfg.generation.replace_unk)
+
+    # Load dataset (possibly sharded)
+    itr = task.get_batch_iterator(
+        dataset=task.dataset(cfg.dataset.gen_subset),
+        max_tokens=cfg.dataset.max_tokens,
+        max_sentences=cfg.dataset.batch_size,
+        max_positions=utils.resolve_max_positions(
+            task.max_positions(), *[m.max_positions() for m in models]
+        ),
+        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
+        required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
+        seed=cfg.common.seed,
+        num_shards=cfg.distributed_training.distributed_world_size,
+        shard_id=cfg.distributed_training.distributed_rank,
+        num_workers=cfg.dataset.num_workers,
+        data_buffer_size=cfg.dataset.data_buffer_size,
+    ).next_epoch_itr(shuffle=False)
+    progress = progress_bar.progress_bar(
+        itr,
+        log_format=cfg.common.log_format,
+        log_interval=cfg.common.log_interval,
+        default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+    )
+
+    # Initialize generator
+    gen_timer = StopwatchMeter()
+
+    extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight}
+    generator = task.build_generator(
+        models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs
+    )
+
+    # Handle tokenization and BPE
+    tokenizer = task.build_tokenizer(cfg.tokenizer)
+    bpe = task.build_bpe(cfg.bpe)
+
+    def decode_fn(x):
+        if bpe is not None:
+            x = bpe.decode(x)
+        if tokenizer is not None:
+            x = tokenizer.decode(x)
+        return x
+
+    scorer = scoring.build_scorer(cfg.scoring, tgt_dict)
+
+    num_sentences = 0
+    has_target = True
+    wps_meter = TimeMeter()
+    for sample in progress:
+        sample = utils.move_to_cuda(sample) if use_cuda else sample
+        if "net_input" not in sample:
+            continue
+
+        prefix_tokens = None
+        if cfg.generation.prefix_size > 0:
+            prefix_tokens = sample["target"][:, : cfg.generation.prefix_size]
+
+        constraints = None
+        if "constraints" in sample:
+            constraints = sample["constraints"]
+
+        gen_timer.start()
+        hypos = task.inference_step(
+            generator,
+            models,
+            sample,
+            prefix_tokens=prefix_tokens,
+            constraints=constraints,
+        )
+        num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
+        gen_timer.stop(num_generated_tokens)
+
+        for i, sample_id in enumerate(sample["id"].tolist()):
+            has_target = sample["target"] is not None
+
+            # Remove padding
+            if "src_tokens" in sample["net_input"]:
+                src_tokens = utils.strip_pad(
+                    sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()
+                )
+            else:
+                src_tokens = None
+
+            target_tokens = None
+            if has_target:
+                target_tokens = (
+                    utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu()
+                )
+
+            # Either retrieve the original sentences or regenerate them from tokens.
+            if align_dict is not None:
+                src_str = task.dataset(cfg.dataset.gen_subset).src.get_original_text(
+                    sample_id
+                )
+                target_str = task.dataset(cfg.dataset.gen_subset).tgt.get_original_text(
+                    sample_id
+                )
+            else:
+                if src_dict is not None:
+                    src_str = src_dict.string(src_tokens, cfg.common_eval.post_process)
+                else:
+                    src_str = ""
+                if has_target:
+                    target_str = tgt_dict.string(
+                        target_tokens,
+                        cfg.common_eval.post_process,
+                        escape_unk=True,
+                        extra_symbols_to_ignore=get_symbols_to_strip_from_output(
+                            generator
+                        ),
+                    )
+
+            src_str = decode_fn(src_str)
+            if has_target:
+                target_str = decode_fn(target_str)
+
+            if not cfg.common_eval.quiet:
+                if src_dict is not None:
+                    print("S-{}\t{}".format(sample_id, src_str), file=output_file)
+                if has_target:
+                    print("T-{}\t{}".format(sample_id, target_str), file=output_file)
+
+            # Process top predictions
+            for j, hypo in enumerate(hypos[i][: cfg.generation.nbest]):
+                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
+                    hypo_tokens=hypo["tokens"].int().cpu(),
+                    src_str=src_str,
+                    alignment=hypo["alignment"],
+                    align_dict=align_dict,
+                    tgt_dict=tgt_dict,
+                    remove_bpe=cfg.common_eval.post_process,
+                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
+                )
+                detok_hypo_str = decode_fn(hypo_str)
+                if not cfg.common_eval.quiet:
+                    score = hypo["score"] / math.log(2)  # convert to base 2
+                    # original hypothesis (after tokenization and BPE)
+                    print(
+                        "H-{}\t{}\t{}".format(sample_id, score, hypo_str),
+                        file=output_file,
+                    )
+                    # detokenized hypothesis
+                    print(
+                        "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str),
+                        file=output_file,
+                    )
+                    print(
+                        "P-{}\t{}".format(
+                            sample_id,
+                            " ".join(
+                                map(
+                                    lambda x: "{:.4f}".format(x),
+                                    # convert from base e to base 2
+                                    hypo["positional_scores"]
+                                    .div_(math.log(2))
+                                    .tolist(),
+                                )
+                            ),
+                        ),
+                        file=output_file,
+                    )
+
+                    if cfg.generation.print_alignment == "hard":
+                        print(
+                            "A-{}\t{}".format(
+                                sample_id,
+                                " ".join(
+                                    [
+                                        "{}-{}".format(src_idx, tgt_idx)
+                                        for src_idx, tgt_idx in alignment
+                                    ]
+                                ),
+                            ),
+                            file=output_file,
+                        )
+                    if cfg.generation.print_alignment == "soft":
+                        print(
+                            "A-{}\t{}".format(
+                                sample_id,
+                                " ".join(
+                                    [",".join(src_probs) for src_probs in alignment]
+                                ),
+                            ),
+                            file=output_file,
+                        )
+
+                    if cfg.generation.print_step:
+                        print(
+                            "I-{}\t{}".format(sample_id, hypo["steps"]),
+                            file=output_file,
+                        )
+
+                    if cfg.generation.retain_iter_history:
+                        for step, h in enumerate(hypo["history"]):
+                            _, h_str, _ = utils.post_process_prediction(
+                                hypo_tokens=h["tokens"].int().cpu(),
+                                src_str=src_str,
+                                alignment=None,
+                                align_dict=None,
+                                tgt_dict=tgt_dict,
+                                remove_bpe=None,
+                            )
+                            print(
+                                "E-{}_{}\t{}".format(sample_id, step, h_str),
+                                file=output_file,
+                            )
+
+                # Score only the top hypothesis
+                if has_target and j == 0:
+                    if (
+                        align_dict is not None
+                        or cfg.common_eval.post_process is not None
+                    ):
+                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
+                        target_tokens = tgt_dict.encode_line(
+                            target_str, add_if_not_exist=True
+                        )
+                        hypo_tokens = tgt_dict.encode_line(
+                            detok_hypo_str, add_if_not_exist=True
+                        )
+                    if hasattr(scorer, "add_string"):
+                        scorer.add_string(target_str, detok_hypo_str)
+                    else:
+                        scorer.add(target_tokens, hypo_tokens)
+
+        wps_meter.update(num_generated_tokens)
+        progress.log({"wps": round(wps_meter.avg)})
+        num_sentences += (
+            sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
+        )
+
+    logger.info("NOTE: hypothesis and token scores are output in base 2")
+    logger.info(
+        "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format(
+            num_sentences,
+            gen_timer.n,
+            gen_timer.sum,
+            num_sentences / gen_timer.sum if gen_timer.sum > 0 else 0,  # no batches run
+            1.0 / gen_timer.avg if gen_timer.avg > 0 else 0,  # same guard as eval_lm
+        )
+    )
+    if has_target:
+        if cfg.bpe and not cfg.generation.sacrebleu:
+            if cfg.common_eval.post_process:
+                logger.warning(
+                    "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization"
+                )
+            else:
+                logger.warning(
+                    "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words.  Use --sacrebleu for standard 13a BLEU tokenization"
+                )
+        # use print to be consistent with other main outputs: S-, H-, T-, D- and so on
+        print(
+            "Generate {} with beam={}: {}".format(
+                cfg.dataset.gen_subset, cfg.generation.beam, scorer.result_string()
+            ),
+            file=output_file,
+        )
+
+    return scorer
+
+
+def cli_main():
+    parser = options.get_generation_parser()
+    # TODO: replace this workaround with refactoring of `AudioPretraining`
+    parser.add_argument(
+        "--arch",
+        "-a",
+        metavar="ARCH",
+        default="wav2vec2",
+        help="Model architecture. For constructing tasks that rely on "
+        "model args (e.g. `AudioPretraining`)",
+    )
+    args = options.parse_args_and_arch(parser)
+    main(args)
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
new file mode 100644
index 0000000000..607340af0d
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+import hydra
+import torch
+from hydra.core.hydra_config import HydraConfig
+from omegaconf import OmegaConf, open_dict
+
+from fairseq import distributed_utils, metrics
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.dataclass.initialize import add_defaults, hydra_init
+from fairseq.dataclass.utils import omegaconf_no_object_check
+from fairseq.utils import reset_logging
+from fairseq_cli.train import main as pre_main
+
+logger = logging.getLogger("fairseq_cli.hydra_train")
+
+
+@hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config")
+def hydra_main(cfg: FairseqConfig) -> float:
+    return _hydra_main(cfg)  # hand best_val back to Hydra (used by sweepers)
+
+
+def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
+    add_defaults(cfg)
+
+    if cfg.common.reset_logging:
+        reset_logging()  # Hydra hijacks logging, fix that
+    else:
+        # check if directly called or called through hydra_main
+        if HydraConfig.initialized():
+            with open_dict(cfg):
+                # make hydra logging work with ddp (see https://github.com/facebookresearch/hydra/issues/1126)
+                cfg.job_logging_cfg = OmegaConf.to_container(
+                    HydraConfig.get().job_logging, resolve=True
+                )
+
+    with omegaconf_no_object_check():
+        cfg = OmegaConf.create(
+            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
+        )
+    OmegaConf.set_struct(cfg, True)
+
+    try:
+        if cfg.common.profile:
+            with torch.cuda.profiler.profile():
+                with torch.autograd.profiler.emit_nvtx():
+                    distributed_utils.call_main(cfg, pre_main, **kwargs)
+        else:
+            distributed_utils.call_main(cfg, pre_main, **kwargs)
+    except BaseException as e:
+        if not cfg.common.suppress_crashes:
+            raise
+        else:
+            logger.error("Crashed! " + str(e))
+
+    # get best val and return - useful for sweepers
+    try:
+        best_val = metrics.get_smoothed_value(
+            "valid", cfg.checkpoint.best_checkpoint_metric
+        )
+    except Exception:  # best-effort lookup; let KeyboardInterrupt/SystemExit through
+        best_val = None
+
+    if best_val is None:
+        best_val = float("inf")
+
+    return best_val
+
+
+def cli_main():
+    try:
+        from hydra._internal.utils import get_args
+
+        cfg_name = get_args().config_name or "config"
+    except:
+        logger.warning("Failed to get config name from hydra args")
+        cfg_name = "config"
+
+    hydra_init(cfg_name)
+    hydra_main()
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
new file mode 100644
index 0000000000..cb6f7612d0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+from itertools import chain
+
+import torch
+from hydra.core.hydra_config import HydraConfig
+from omegaconf import OmegaConf, open_dict
+import hydra
+
+from fairseq import checkpoint_utils, distributed_utils, utils
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.dataclass.initialize import add_defaults, hydra_init
+from fairseq.dataclass.utils import omegaconf_no_object_check
+from fairseq.distributed import utils as distributed_utils
+from fairseq.logging import metrics, progress_bar
+from fairseq.utils import reset_logging
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),  # LOGLEVEL env var overrides the INFO default
+    stream=sys.stdout,
+)
+logger = logging.getLogger("fairseq_cli.validate")
+
+
+@hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config")
+def hydra_main(cfg: FairseqConfig) -> float:  # Hydra-decorated entry point; forwards cfg to _hydra_main
+    return _hydra_main(cfg)
+
+
+def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
+    """Prepare ``cfg`` and run ``validate`` via ``distributed_utils.call_main``.
+
+    There is no ``return`` statement, so the result is ``None`` despite the annotation."""
+    add_defaults(cfg)
+    if cfg.common.reset_logging:
+        reset_logging()  # Hydra hijacks logging, fix that
+    else:
+        # check if directly called or called through hydra_main
+        if HydraConfig.initialized():
+            with open_dict(cfg):
+                # make hydra logging work with ddp (see https://github.com/facebookresearch/hydra/issues/1126)
+                cfg.job_logging_cfg = OmegaConf.to_container(
+                    HydraConfig.get().job_logging, resolve=True
+                )
+    # Round-trip through a plain container (resolve=True, enum_to_str=True) before re-creating cfg.
+    with omegaconf_no_object_check():
+        cfg = OmegaConf.create(
+            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
+        )
+    OmegaConf.set_struct(cfg, True)
+    assert (
+        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
+    ), "Must specify batch size either with --max-tokens or --batch-size"
+    distributed_utils.call_main(cfg, validate, **kwargs)
+
+
+def validate(cfg):
+    """Evaluate the checkpoint at ``cfg.common_eval.path`` on each validation subset.
+
+    Runs ``task.valid_step`` batch by batch for every comma-separated name in
+    ``cfg.dataset.valid_subset`` and prints the aggregated metrics per subset."""
+    utils.import_user_module(cfg.common)
+    use_fp16 = cfg.common.fp16
+    use_cuda = torch.cuda.is_available() and not cfg.common.cpu
+    if use_cuda:
+        torch.cuda.set_device(cfg.distributed_training.device_id)
+
+    if cfg.distributed_training.distributed_world_size > 1:
+        data_parallel_world_size = distributed_utils.get_data_parallel_world_size()
+        data_parallel_rank = distributed_utils.get_data_parallel_rank()
+    else:
+        data_parallel_world_size = 1
+        data_parallel_rank = 0
+
+    overrides = {"task": {"data": cfg.task.data}}
+    # Load ensemble
+    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
+    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+        [cfg.common_eval.path],
+        arg_overrides=overrides,
+        suffix=cfg.checkpoint.checkpoint_suffix,
+    )
+    model = models[0]
+
+    # Move models to GPU (separate loop variable so ``model`` stays ``models[0]``)
+    for m in models:
+        m.eval()
+        if use_fp16:
+            m.half()
+        if use_cuda:
+            m.cuda()
+
+    # Print args
+    logger.info(saved_cfg)
+
+    # Build criterion
+    criterion = task.build_criterion(saved_cfg.criterion, from_checkpoint=True)
+    criterion.eval()
+
+    for subset in cfg.dataset.valid_subset.split(","):
+        try:
+            task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task)
+            dataset = task.dataset(subset)
+        except KeyError:
+            raise Exception("Cannot find dataset: " + subset)
+
+        # Initialize data iterator
+        itr = task.get_batch_iterator(
+            dataset=dataset,
+            max_tokens=cfg.dataset.max_tokens,
+            max_sentences=cfg.dataset.batch_size,
+            max_positions=utils.resolve_max_positions(
+                task.max_positions(),
+                *[m.max_positions() for m in models],
+            ),
+            ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
+            required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
+            seed=cfg.common.seed,
+            num_shards=data_parallel_world_size,
+            shard_id=data_parallel_rank,
+            num_workers=cfg.dataset.num_workers,
+            data_buffer_size=cfg.dataset.data_buffer_size,
+        ).next_epoch_itr(shuffle=False)
+        progress = progress_bar.progress_bar(
+            itr,
+            log_format=cfg.common.log_format,
+            log_interval=cfg.common.log_interval,
+            prefix=f"valid on '{subset}' subset",
+            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+        )
+
+        def apply_half(t):
+            if t.dtype is torch.float32:
+                return t.to(dtype=torch.half)
+            return t
+
+        log_outputs = []
+        i = 0  # keeps ``step`` defined below even if this shard yields no batches
+        for i, sample in enumerate(progress):
+            sample = utils.move_to_cuda(sample) if use_cuda else sample
+            if use_fp16:
+                sample = utils.apply_to_sample(apply_half, sample)
+
+            _loss, _sample_size, log_output = task.valid_step(sample, model, criterion)
+            with metrics.aggregate() as agg:
+                task.reduce_metrics([log_output], criterion)
+                progress.log(agg.get_smoothed_values(), step=i)
+            log_outputs.append(log_output)
+
+        if data_parallel_world_size > 1:
+            log_outputs = distributed_utils.all_gather_list(
+                log_outputs,
+                max_size=cfg.common.all_gather_list_size,
+                group=distributed_utils.get_data_parallel_group(),
+            )
+            log_outputs = list(chain.from_iterable(log_outputs))
+
+        with metrics.aggregate() as agg:
+            task.reduce_metrics(log_outputs, criterion)
+            log_output = agg.get_smoothed_values()
+
+        progress.print(log_output, tag=subset, step=i)
+
+
+def cli_main():
+    """Resolve the Hydra config name, then call ``hydra_init`` and ``hydra_main``."""
+    try:
+        # Private Hydra helper; fall back to "config" if it is unavailable or fails.
+        from hydra._internal.utils import get_args
+        cfg_name = get_args().config_name or "config"
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        logger.warning("Failed to get config name from hydra args")
+        cfg_name = "config"
+    hydra_init(cfg_name)
+    hydra_main()
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
new file mode 100644
index 0000000000..03265d00e8
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Translate raw text with a trained model. Batches data on-the-fly.
+"""
+
+import ast
+import fileinput
+import logging
+import math
+import os
+import sys
+import time
+from argparse import Namespace
+from collections import namedtuple
+
+import numpy as np
+import torch
+
+from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.token_generation_constraints import pack_constraints, unpack_constraints
+from fairseq_cli.generate import get_symbols_to_strip_from_output
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),  # LOGLEVEL env var overrides the INFO default
+    stream=sys.stdout,
+)
+logger = logging.getLogger("fairseq_cli.interactive")
+
+# Batch is what make_batches yields; Translation is not referenced elsewhere in this file.
+Batch = namedtuple("Batch", "ids src_tokens src_lengths constraints")
+Translation = namedtuple("Translation", "src_str hypos pos_scores alignments")
+
+
+def buffered_read(input, buffer_size):
+    """Yield lists of stripped lines from ``input``, flushing at ``buffer_size`` lines."""
+    pending = []
+    with fileinput.input(files=[input], openhook=fileinput.hook_encoded("utf-8")) as fh:
+        for raw_line in fh:
+            pending.append(raw_line.strip())
+            if len(pending) >= buffer_size:
+                yield pending
+                pending = []
+    if pending:
+        yield pending
+
+
+def make_batches(lines, cfg, task, max_positions, encode_fn):
+    """Yield ``Batch`` tuples for ``lines``, packing tab-delimited constraints if enabled.
+
+    When ``cfg.generation.constraints`` is set, ``lines`` is modified in place:
+    each line containing a tab keeps only the text before its first tab, and
+    the remaining tab-separated fields are encoded with ``encode_fn`` and
+    ``task.target_dictionary`` before being handed to ``pack_constraints``.
+    Batching uses ``cfg.dataset.max_tokens`` / ``cfg.dataset.batch_size``.
+    """
+    constraints_tensor = None
+    if cfg.generation.constraints:
+        # Split off the tab-delimited constraints, if present, from each input
+        # line and collect them per line.
+        per_line = [[] for _ in lines]
+        for idx, text in enumerate(lines):
+            if "\t" in text:
+                lines[idx], *per_line[idx] = text.split("\t")
+
+        # Encode each constraint string with the target dictionary.
+        for idx, raw_constraints in enumerate(per_line):
+            per_line[idx] = [
+                task.target_dictionary.encode_line(
+                    encode_fn(raw),
+                    append_eos=False,
+                    add_if_not_exist=False,
+                )
+                for raw in raw_constraints
+            ]
+        constraints_tensor = pack_constraints(per_line)
+
+    tokens, lengths = task.get_interactive_tokens_and_lengths(lines, encode_fn)
+    dataset = task.build_dataset_for_inference(
+        tokens, lengths, constraints=constraints_tensor
+    )
+
+    epoch_itr = task.get_batch_iterator(
+        dataset=dataset,
+        max_tokens=cfg.dataset.max_tokens,
+        max_sentences=cfg.dataset.batch_size,
+        max_positions=max_positions,
+        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
+    )
+
+    for batch in epoch_itr.next_epoch_itr(shuffle=False):
+        net_input = batch["net_input"]
+        yield Batch(
+            ids=batch["id"],
+            src_tokens=net_input["src_tokens"],
+            src_lengths=net_input["src_lengths"],
+            constraints=batch.get("constraints", None),
+        )
+
+
+def main(cfg: FairseqConfig):
+    """Translate lines read from ``cfg.interactive.input`` and print the results.
+
+    Prints S-/W-/C- lines per source sentence (when a source dictionary exists) and
+    H-/D-/P- (optionally A-) lines for up to ``cfg.generation.nbest`` hypotheses."""
+    if isinstance(cfg, Namespace):
+        cfg = convert_namespace_to_omegaconf(cfg)
+    start_time = time.time()
+    total_translate_time = 0
+    utils.import_user_module(cfg.common)
+
+    if cfg.interactive.buffer_size < 1:
+        cfg.interactive.buffer_size = 1
+    if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None:
+        cfg.dataset.batch_size = 1
+
+    assert (
+        not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam
+    ), "--sampling requires --nbest to be equal to --beam"
+    assert (
+        not cfg.dataset.batch_size
+        or cfg.dataset.batch_size <= cfg.interactive.buffer_size
+    ), "--batch-size cannot be larger than --buffer-size"
+
+    logger.info(cfg)
+    # Fix seed for stochastic decoding
+    if cfg.common.seed is not None and not cfg.generation.no_seed_provided:
+        np.random.seed(cfg.common.seed)
+        utils.set_torch_seed(cfg.common.seed)
+
+    use_cuda = torch.cuda.is_available() and not cfg.common.cpu
+    # Setup task, e.g., translation
+    task = tasks.setup_task(cfg.task)
+
+    # Load ensemble
+    overrides = ast.literal_eval(cfg.common_eval.model_overrides)
+    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
+    models, _model_args = checkpoint_utils.load_model_ensemble(
+        utils.split_paths(cfg.common_eval.path),
+        arg_overrides=overrides,
+        task=task,
+        suffix=cfg.checkpoint.checkpoint_suffix,
+        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
+        num_shards=cfg.checkpoint.checkpoint_shard_count,
+    )
+
+    # Set dictionaries
+    src_dict = task.source_dictionary
+    tgt_dict = task.target_dictionary
+
+    # Optimize ensemble for generation
+    for model in models:
+        if model is None:
+            continue
+        if cfg.common.fp16:
+            model.half()
+        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
+            model.cuda()
+        model.prepare_for_inference_(cfg)
+
+    # Initialize generator
+    generator = task.build_generator(models, cfg.generation)
+
+    # Handle tokenization and BPE
+    tokenizer = task.build_tokenizer(cfg.tokenizer)
+    bpe = task.build_bpe(cfg.bpe)
+
+    def encode_fn(x):
+        if tokenizer is not None:
+            x = tokenizer.encode(x)
+        if bpe is not None:
+            x = bpe.encode(x)
+        return x
+
+    def decode_fn(x):
+        if bpe is not None:
+            x = bpe.decode(x)
+        if tokenizer is not None:
+            x = tokenizer.decode(x)
+        return x
+
+    # Load alignment dictionary for unknown word replacement
+    # (None if no unknown word replacement, empty if no path to align dictionary)
+    align_dict = utils.load_align_dict(cfg.generation.replace_unk)
+
+    max_positions = utils.resolve_max_positions(
+        task.max_positions(), *[model.max_positions() for model in models]
+    )
+
+    if cfg.generation.constraints:
+        logger.warning(
+            "NOTE: Constrained decoding currently assumes a shared subword vocabulary."
+        )
+
+    if cfg.interactive.buffer_size > 1:
+        logger.info("Sentence buffer size: %s", cfg.interactive.buffer_size)
+    logger.info("NOTE: hypothesis and token scores are output in base 2")
+    logger.info("Type the input sentence and press return:")
+    start_id = 0
+    for inputs in buffered_read(cfg.interactive.input, cfg.interactive.buffer_size):
+        results = []
+        for batch in make_batches(inputs, cfg, task, max_positions, encode_fn):
+            bsz = batch.src_tokens.size(0)
+            src_tokens = batch.src_tokens
+            src_lengths = batch.src_lengths
+            constraints = batch.constraints
+            if use_cuda:
+                src_tokens = src_tokens.cuda()
+                src_lengths = src_lengths.cuda()
+                if constraints is not None:
+                    constraints = constraints.cuda()
+
+            sample = {
+                "net_input": {
+                    "src_tokens": src_tokens,
+                    "src_lengths": src_lengths,
+                },
+            }
+            translate_start_time = time.time()
+            translations = task.inference_step(
+                generator, models, sample, constraints=constraints
+            )
+            translate_time = time.time() - translate_start_time
+            total_translate_time += translate_time
+            list_constraints = [[] for _ in range(bsz)]
+            if cfg.generation.constraints:
+                list_constraints = [unpack_constraints(c) for c in constraints]
+            for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
+                src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())  # NOTE(review): source tokens are stripped using the *target* dictionary's pad index
+                constraints = list_constraints[i]  # rebinds the batch-level name to this sentence's constraints
+                results.append(
+                    (
+                        start_id + id,
+                        src_tokens_i,
+                        hypos,
+                        {
+                            "constraints": constraints,
+                            "time": translate_time / len(translations),  # batch time split evenly across its sentences
+                        },
+                    )
+                )
+
+        # sort output to match input order
+        for id_, src_tokens, hypos, info in sorted(results, key=lambda x: x[0]):
+            src_str = ""
+            if src_dict is not None:
+                src_str = src_dict.string(src_tokens, cfg.common_eval.post_process)
+                print("S-{}\t{}".format(id_, src_str))
+                print("W-{}\t{:.3f}\tseconds".format(id_, info["time"]))
+                for constraint in info["constraints"]:
+                    print(
+                        "C-{}\t{}".format(
+                            id_,
+                            tgt_dict.string(constraint, cfg.common_eval.post_process),
+                        )
+                    )
+
+            # Process top predictions
+            for hypo in hypos[: min(len(hypos), cfg.generation.nbest)]:
+                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
+                    hypo_tokens=hypo["tokens"].int().cpu(),
+                    src_str=src_str,
+                    alignment=hypo["alignment"],
+                    align_dict=align_dict,
+                    tgt_dict=tgt_dict,
+                    remove_bpe=cfg.common_eval.post_process,
+                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
+                )
+                detok_hypo_str = decode_fn(hypo_str)
+                score = hypo["score"] / math.log(2)  # convert to base 2
+                # original hypothesis (after tokenization and BPE)
+                print("H-{}\t{}\t{}".format(id_, score, hypo_str))
+                # detokenized hypothesis
+                print("D-{}\t{}\t{}".format(id_, score, detok_hypo_str))
+                print(
+                    "P-{}\t{}".format(
+                        id_,
+                        " ".join(
+                            map(
+                                lambda x: "{:.4f}".format(x),
+                                # convert from base e to base 2
+                                hypo["positional_scores"].div_(math.log(2)).tolist(),  # div_ rescales the stored tensor in place
+                            )
+                        ),
+                    )
+                )
+                if cfg.generation.print_alignment:
+                    alignment_str = " ".join(
+                        ["{}-{}".format(src, tgt) for src, tgt in alignment]
+                    )
+                    print("A-{}\t{}".format(id_, alignment_str))
+
+        # update running id_ counter
+        start_id += len(inputs)
+
+    logger.info(
+        "Total time: {:.3f} seconds; translation time: {:.3f}".format(
+            time.time() - start_time, total_translate_time
+        )
+    )
+
+
+def cli_main():
+    """Parse command-line arguments and run ``main`` via ``distributed_utils.call_main``."""
+    args = options.parse_args_and_arch(options.get_interactive_generation_parser())
+    distributed_utils.call_main(convert_namespace_to_omegaconf(args), main)
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
new file mode 100644
index 0000000000..2ba9e09338
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Data pre-processing: build vocabularies and binarize training data.
+"""
+
+import logging
+import os
+import shutil
+import sys
+import typing as tp
+from argparse import Namespace
+from itertools import zip_longest
+
+from fairseq import options, tasks, utils
+from fairseq.binarizer import (
+    AlignmentDatasetBinarizer,
+    FileBinarizer,
+    VocabularyDatasetBinarizer,
+)
+from fairseq.data import Dictionary
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),  # LOGLEVEL env var overrides the INFO default
+    stream=sys.stdout,
+)
+logger = logging.getLogger("fairseq_cli.preprocess")
+
+#####################################################################
+# file name tools
+#####################################################################
+
+
+def _train_path(lang, trainpref):  # trainpref, plus ".<lang>" when lang is truthy
+    return f"{trainpref}{'.' + lang if lang else ''}"
+
+
+def _file_name(prefix, lang):
+    """Return ``prefix`` with ``.<lang>`` appended, or ``prefix`` alone when ``lang`` is None."""
+    if lang is None:
+        return prefix
+    return prefix + ".{lang}".format(lang=lang)
+
+
+def _dest_path(prefix, lang, destdir):  # <destdir>/<prefix>[.<lang>]
+    return os.path.join(destdir, _file_name(prefix=prefix, lang=lang))
+
+
+def _dict_path(lang, destdir):  # <destdir>/dict[.<lang>].txt
+    return "{}.txt".format(_dest_path("dict", lang, destdir))
+
+
+def dataset_dest_prefix(args, output_prefix, lang):
+    """Return the output path prefix (under ``args.destdir``) for one dataset file."""
+    if lang is None and args.only_source:
+        suffix = ""  # source-only data without a language carries no pair tag
+    else:
+        suffix = f".{args.source_lang}-{args.target_lang}"
+        if lang is not None:
+            suffix += f".{lang}"
+
+    return os.path.join(args.destdir, output_prefix) + suffix
+
+
+def dataset_dest_file(args, output_prefix, lang, extension):  # dataset prefix plus ".<extension>"
+    return f"{dataset_dest_prefix(args, output_prefix, lang)}.{extension}"
+
+
+#####################################################################
+# dictionary tools
+#####################################################################
+
+
+def _build_dictionary(filenames, task, args, src=False, tgt=False):
+    """Build a dictionary from ``filenames`` via ``task.build_dictionary``.
+
+    Exactly one of ``src`` / ``tgt`` must be true; it selects whether the
+    ``*src`` or ``*tgt`` threshold and nwords options of ``args`` are used.
+    """
+    assert src ^ tgt
+    side = "src" if src else "tgt"
+    return task.build_dictionary(
+        filenames,
+        workers=args.workers,
+        threshold=getattr(args, "threshold" + side),
+        nwords=getattr(args, "nwords" + side),
+        padding_factor=args.padding_factor,
+    )
+
+
+#####################################################################
+# bin file creation logic
+#####################################################################
+
+
+def _make_binary_dataset(
+    vocab: Dictionary,
+    input_prefix: str,
+    output_prefix: str,
+    lang: tp.Optional[str],
+    num_workers: int,
+    args: Namespace,
+):
+    """Binarize ``<input_prefix>[.<lang>]`` with ``vocab`` and log the resulting summary.
+
+    The output prefix comes from ``dataset_dest_prefix``; ``args.dataset_impl``
+    is passed through to ``FileBinarizer.multiprocess_dataset``.
+    """
+    logger.info("[{}] Dictionary: {} types".format(lang, len(vocab)))
+
+    # The language suffix is appended only when a language is given.
+    input_file = f"{input_prefix}{'.' + lang if lang is not None else ''}"
+
+    summary = FileBinarizer.multiprocess_dataset(
+        input_file,
+        args.dataset_impl,
+        VocabularyDatasetBinarizer(vocab, append_eos=True),
+        dataset_dest_prefix(args, output_prefix, lang),
+        vocab_size=len(vocab),
+        num_workers=num_workers,
+    )
+
+    logger.info(f"[{lang}] {input_file}: {summary} (by {vocab.unk_word})")
+
+
+def _make_binary_alignment_dataset(
+    input_prefix: str, output_prefix: str, num_workers: int, args: Namespace
+):
+    """Binarize the alignment file ``input_prefix`` and log how many alignments were parsed.
+
+    ``input_prefix`` is used as the input path unchanged; the output prefix
+    comes from ``dataset_dest_prefix`` with ``lang=None``.
+    """
+    # The binarizer wraps utils.parse_alignment; vocab_size is passed as None
+    # because no vocabulary object is given to this function.
+    summary = FileBinarizer.multiprocess_dataset(
+        input_prefix,
+        args.dataset_impl,
+        AlignmentDatasetBinarizer(utils.parse_alignment),
+        dataset_dest_prefix(args, output_prefix, lang=None),
+        vocab_size=None,
+        num_workers=num_workers,
+    )
+
+    message = "[alignments] {}: parsed {} alignments".format(
+        input_prefix, summary.num_seq
+    )
+    logger.info(message)
+
+
+#####################################################################
+# routing logic
+#####################################################################
+
+
+def _make_dataset(
+    vocab: Dictionary,
+    input_prefix: str,
+    output_prefix: str,
+    lang: tp.Optional[str],
+    args: Namespace,
+    num_workers: int,
+):
+    """Copy (when ``args.dataset_impl == "raw"``) or binarize one dataset split."""
+    if args.dataset_impl != "raw":
+        _make_binary_dataset(
+            vocab, input_prefix, output_prefix, lang, num_workers, args
+        )
+        return
+
+    # Raw mode: copy the original text file to the destination folder.
+    pair_prefix = f"{output_prefix}.{args.source_lang}-{args.target_lang}"
+    shutil.copyfile(
+        _file_name(input_prefix, lang), _dest_path(pair_prefix, lang, args.destdir)
+    )
+
+
+def _make_all(lang, vocab, args):
+    """Create the train, valid and test datasets for ``lang`` from the prefixes in ``args``.
+
+    ``args.validpref`` / ``args.testpref`` may list several comma-separated
+    prefixes; the first is written as ``valid`` / ``test`` and the k-th (k > 0)
+    as ``valid<k>`` / ``test<k>``.
+    """
+    if args.trainpref:
+        _make_dataset(
+            vocab, args.trainpref, "train", lang, args=args, num_workers=args.workers
+        )
+    for split, prefixes in (("valid", args.validpref), ("test", args.testpref)):
+        if not prefixes:
+            continue
+        for k, pref in enumerate(prefixes.split(",")):
+            name = split if k == 0 else "{}{}".format(split, k)
+            _make_dataset(vocab, pref, name, lang, args=args, num_workers=args.workers)
+
+
+def _make_all_alignments(args):
+    """Binarize the alignment file of every split whose ``<pref>.<align_suffix>`` exists.
+
+    Splits with an unset prefix or a missing alignment file are skipped. Each
+    prefix attribute is used as a single path (it is not split on commas).
+    """
+    splits = (
+        (args.trainpref, "train.align"),
+        (args.validpref, "valid.align"),
+        (args.testpref, "test.align"),
+    )
+
+    # Same check for each split: prefix set and alignment file present on disk.
+    for pref, out_name in splits:
+        if not pref:
+            continue
+        align_path = pref + "." + args.align_suffix
+        if not os.path.exists(align_path):
+            continue
+        _make_binary_alignment_dataset(
+            align_path, out_name, num_workers=args.workers, args=args
+        )
+
+
+#####################################################################
+# align
+#####################################################################
+
+
+def _align_files(args, src_dict, tgt_dict):
+    """Write ``alignment.<src>-<tgt>.txt``: each source word with its most frequently aligned target word.
+
+    ``args.alignfile`` is read in lockstep with the source and target training
+    files; pairs involving an unknown word on either side are ignored.
+
+    Raises:
+        ValueError: if the three files do not have the same number of lines.
+    """
+    assert args.trainpref, "--trainpref must be set if --alignfile is specified"
+    src_file_name = _train_path(args.source_lang, args.trainpref)
+    tgt_file_name = _train_path(args.target_lang, args.trainpref)
+    freq_map = {}
+    with open(args.alignfile, "r", encoding="utf-8") as align_file, open(
+        src_file_name, "r", encoding="utf-8"
+    ) as src_file, open(tgt_file_name, "r", encoding="utf-8") as tgt_file:
+        for a, s, t in zip_longest(align_file, src_file, tgt_file):
+            # zip_longest pads the shorter files with None; fail with a clear message
+            if a is None or s is None or t is None:
+                raise ValueError("alignment, source and target files differ in length")
+            si = src_dict.encode_line(s, add_if_not_exist=False)
+            ti = tgt_dict.encode_line(t, add_if_not_exist=False)
+            for pair in a.split():
+                sai, tai = pair.split("-")
+                # int(): equal ids must share one dict key even if si/ti hold tensor scalars
+                srcidx, tgtidx = int(si[int(sai)]), int(ti[int(tai)])
+                if srcidx == src_dict.unk() or tgtidx == tgt_dict.unk():
+                    continue
+                assert srcidx != src_dict.pad() and srcidx != src_dict.eos()
+                assert tgtidx != tgt_dict.pad() and tgtidx != tgt_dict.eos()
+                counts = freq_map.setdefault(srcidx, {})
+                counts[tgtidx] = counts.get(tgtidx, 0) + 1
+
+    out_name = "alignment.{}-{}.txt".format(args.source_lang, args.target_lang)
+    with open(os.path.join(args.destdir, out_name), "w", encoding="utf-8") as f:
+        for srcidx, counts in freq_map.items():
+            # most frequent target id for this source id
+            best_tgt = max(counts, key=counts.get)
+            print("{} {}".format(src_dict[srcidx], tgt_dict[best_tgt]), file=f)
+
+
+#####################################################################
+# MAIN
+#####################################################################
+
+
+def main(args):
+    """Build or load the dictionaries, save them under ``args.destdir``, then write the datasets.
+
+    Returns right after saving the dictionaries when ``args.dict_only`` is set; otherwise also
+    handles ``args.align_suffix`` alignments and the ``args.alignfile`` alignment dictionary."""
+    # setup some basic things
+    utils.import_user_module(args)
+    os.makedirs(args.destdir, exist_ok=True)
+
+    logger.addHandler(
+        logging.FileHandler(
+            filename=os.path.join(args.destdir, "preprocess.log"),
+        )
+    )
+    logger.info(args)
+
+    assert (
+        args.dataset_impl != "huffman"
+    ), "preprocessing.py doesn't support Huffman yet, use HuffmanCodeBuilder directly."
+
+    # build dictionaries
+    target = not args.only_source
+    # Refuse to overwrite a dictionary file already in destdir when none was passed in.
+    if not args.srcdict and os.path.exists(_dict_path(args.source_lang, args.destdir)):
+        raise FileExistsError(_dict_path(args.source_lang, args.destdir))
+
+    if (
+        target
+        and not args.tgtdict
+        and os.path.exists(_dict_path(args.target_lang, args.destdir))
+    ):
+        raise FileExistsError(_dict_path(args.target_lang, args.destdir))
+
+    task = tasks.get_task(args.task)
+
+    if args.joined_dictionary:
+        assert (
+            not args.srcdict or not args.tgtdict
+        ), "cannot use both --srcdict and --tgtdict with --joined-dictionary"
+
+        if args.srcdict:
+            src_dict = task.load_dictionary(args.srcdict)
+        elif args.tgtdict:
+            src_dict = task.load_dictionary(args.tgtdict)
+        else:
+            assert (
+                args.trainpref
+            ), "--trainpref must be set if --srcdict is not specified"
+            src_dict = _build_dictionary(
+                {
+                    _train_path(lang, args.trainpref)
+                    for lang in [args.source_lang, args.target_lang]
+                },
+                task=task,
+                args=args,
+                src=True,
+            )
+        tgt_dict = src_dict  # joined dictionary: both sides share the same object
+    else:
+        if args.srcdict:
+            src_dict = task.load_dictionary(args.srcdict)
+        else:
+            assert (
+                args.trainpref
+            ), "--trainpref must be set if --srcdict is not specified"
+            src_dict = _build_dictionary(
+                [_train_path(args.source_lang, args.trainpref)],
+                task=task,
+                args=args,
+                src=True,
+            )
+
+        if target:
+            if args.tgtdict:
+                tgt_dict = task.load_dictionary(args.tgtdict)
+            else:
+                assert (
+                    args.trainpref
+                ), "--trainpref must be set if --tgtdict is not specified"
+                tgt_dict = _build_dictionary(
+                    [_train_path(args.target_lang, args.trainpref)],
+                    task=task,
+                    args=args,
+                    tgt=True,
+                )
+        else:
+            tgt_dict = None
+
+    # save dictionaries
+    src_dict.save(_dict_path(args.source_lang, args.destdir))
+    if target and tgt_dict is not None:
+        tgt_dict.save(_dict_path(args.target_lang, args.destdir))
+
+    if args.dict_only:
+        return
+
+    _make_all(args.source_lang, src_dict, args)
+    if target:
+        _make_all(args.target_lang, tgt_dict, args)
+
+    # align the datasets if needed
+    if args.align_suffix:
+        _make_all_alignments(args)
+
+    logger.info("Wrote preprocessed data to {}".format(args.destdir))
+
+    if args.alignfile:
+        _align_files(args, src_dict=src_dict, tgt_dict=tgt_dict)
+
+
+def cli_main():
+    """Parse the preprocessing command-line arguments and run ``main``."""
+    args = options.get_preprocessing_parser().parse_args()
+    main(args)
+
+
+if __name__ == "__main__":
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
new file mode 100644
index 0000000000..0b207be959
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+BLEU scoring of generated translations against reference translations.
+"""
+
+import argparse
+import os
+import sys
+
+from fairseq.data import dictionary
+from fairseq.scoring import bleu
+
+
+def get_parser():
+    """Build the argparse parser for the BLEU scoring command-line tool."""
+    cli = argparse.ArgumentParser(description="Command-line script for BLEU scoring.")
+    add_option = cli.add_argument
+    # fmt: off
+    add_option('-s', '--sys', default='-', help='system output')
+    add_option('-r', '--ref', required=True, help='references')
+    add_option('-o', '--order', default=4, metavar='N', type=int,
+               help='consider ngrams up to this order')
+    add_option('--ignore-case', action='store_true',
+               help='case-insensitive scoring')
+    add_option('--sacrebleu', action='store_true',
+               help='score with sacrebleu')
+    add_option('--sentence-bleu', action='store_true',
+               help='report sentence-level BLEUs (i.e., with +1 smoothing)')
+    # fmt: on
+    return cli
+
+
+def cli_main():
+    """Parse the command line and print BLEU for --sys (or stdin) against --ref."""
+    parser = get_parser()
+    args = parser.parse_args()
+    print(args)
+
+    assert args.sys == "-" or os.path.exists(
+        args.sys
+    ), "System output file {} does not exist".format(args.sys)
+    assert os.path.exists(args.ref), "Reference file {} does not exist".format(args.ref)
+
+    vocab = dictionary.Dictionary()  # not named `dict`: avoid shadowing the builtin
+
+    def readlines(fd):
+        for line in fd:  # stream lazily instead of materialising fd.readlines()
+            yield line.lower() if args.ignore_case else line
+
+    def new_scorer():
+        # Scorer takes a BleuConfig; the sentence-BLEU path used to pass bare ids
+        return bleu.Scorer(
+            bleu.BleuConfig(pad=vocab.pad(), eos=vocab.eos(), unk=vocab.unk())
+        )
+
+    if args.sacrebleu:
+        import sacrebleu
+
+        def score(fdsys):
+            with open(args.ref) as fdref:
+                print(sacrebleu.corpus_bleu(fdsys, [fdref]).format())
+
+    elif args.sentence_bleu:
+
+        def score(fdsys):
+            # one result line per sentence pair; the scorer is reset before each pair
+            with open(args.ref) as fdref:
+                scorer = new_scorer()
+                for i, (sys_tok, ref_tok) in enumerate(
+                    zip(readlines(fdsys), readlines(fdref))
+                ):
+                    scorer.reset(one_init=True)
+                    sys_tok = vocab.encode_line(sys_tok)
+                    ref_tok = vocab.encode_line(ref_tok)
+                    scorer.add(ref_tok, sys_tok)
+                    print(i, scorer.result_string(args.order))
+
+    else:
+
+        def score(fdsys):
+            # statistics accumulate over all pairs; a single result is printed
+            with open(args.ref) as fdref:
+                scorer = new_scorer()
+                for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
+                    sys_tok = vocab.encode_line(sys_tok)
+                    ref_tok = vocab.encode_line(ref_tok)
+                    scorer.add(ref_tok, sys_tok)
+                print(scorer.result_string(args.order))
+
+    if args.sys == "-":
+        score(sys.stdin)
+    else:
+        with open(args.sys, "r") as f:
+            score(f)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
new file mode 100644
index 0000000000..f771bff654
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
@@ -0,0 +1,581 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Train a new model on one or across multiple GPUs.
+"""
+
+import argparse
+import logging
+import math
+import os
+import sys
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+# The root logger must be set up before importing any fairseq libraries.
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),  # LOGLEVEL env var, default INFO
+    stream=sys.stdout,  # log to stdout rather than the default stderr
+)
+logger = logging.getLogger("fairseq_cli.train")
+
+import numpy as np
+import torch
+from omegaconf import DictConfig, OmegaConf
+
+from fairseq import checkpoint_utils, options, quantization_utils, tasks, utils
+from fairseq.data import data_utils, iterators
+from fairseq.data.plasma_utils import PlasmaStore
+from fairseq.dataclass.configs import FairseqConfig
+from fairseq.dataclass.initialize import add_defaults
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.distributed import fsdp_enable_wrap, fsdp_wrap
+from fairseq.distributed import utils as distributed_utils
+from fairseq.file_io import PathManager
+from fairseq.logging import meters, metrics, progress_bar
+from fairseq.model_parallel.megatron_trainer import MegatronTrainer
+from fairseq.trainer import Trainer
+
+
+def main(cfg: FairseqConfig) -> None:
+    """Build task, model, criterion and trainer from ``cfg`` and run training."""
+    if isinstance(cfg, argparse.Namespace):
+        cfg = convert_namespace_to_omegaconf(cfg)
+
+    utils.import_user_module(cfg.common)
+    add_defaults(cfg)
+
+    if (
+        distributed_utils.is_master(cfg.distributed_training)
+        and "job_logging_cfg" in cfg
+    ):
+        # make hydra logging work with ddp (see https://github.com/facebookresearch/hydra/issues/1126)
+        from logging.config import dictConfig  # not loaded by bare `import logging`
+        dictConfig(OmegaConf.to_container(cfg.job_logging_cfg))
+
+    assert (
+        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
+    ), "Must specify batch size either with --max-tokens or --batch-size"
+    metrics.reset()
+
+    if cfg.common.log_file is not None:
+        handler = logging.FileHandler(filename=cfg.common.log_file)
+        logger.addHandler(handler)
+
+    np.random.seed(cfg.common.seed)
+    utils.set_torch_seed(cfg.common.seed)
+
+    if distributed_utils.is_master(cfg.distributed_training):
+        checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir)
+
+    logger.info(cfg)
+
+    if cfg.checkpoint.write_checkpoints_asynchronously:
+        try:
+            import iopath  # noqa: F401
+        except ImportError:
+            logger.exception(
+                "Asynchronous checkpoint writing is specified but iopath is "
+                "not installed: `pip install iopath`"
+            )
+            return
+
+    # Setup task, e.g., translation, language modeling, etc.
+    task = tasks.setup_task(cfg.task)
+
+    assert cfg.criterion, "Please specify criterion to train a model"
+
+    # Build model and criterion
+    if cfg.distributed_training.ddp_backend == "fully_sharded":
+        with fsdp_enable_wrap(cfg.distributed_training):
+            model = fsdp_wrap(task.build_model(cfg.model))
+    else:
+        model = task.build_model(cfg.model)
+    criterion = task.build_criterion(cfg.criterion)
+    logger.info(model)
+    logger.info("task: {}".format(task.__class__.__name__))
+    logger.info("model: {}".format(model.__class__.__name__))
+    logger.info("criterion: {}".format(criterion.__class__.__name__))
+    logger.info(
+        "num. shared model params: {:,} (num. trained: {:,})".format(
+            sum(
+                p.numel() for p in model.parameters() if not getattr(p, "expert", False)
+            ),
+            sum(
+                p.numel()
+                for p in model.parameters()
+                if not getattr(p, "expert", False) and p.requires_grad
+            ),
+        )
+    )
+
+    logger.info(
+        "num. expert model params: {} (num. trained: {})".format(
+            sum(p.numel() for p in model.parameters() if getattr(p, "expert", False)),
+            sum(
+                p.numel()
+                for p in model.parameters()
+                if getattr(p, "expert", False) and p.requires_grad
+            ),
+        )
+    )
+
+    # Load valid dataset (we load training data below, based on the latest checkpoint)
+    # We load the valid dataset AFTER building the model
+    if not cfg.dataset.disable_validation:
+        data_utils.raise_if_valid_subsets_unintentionally_ignored(cfg)
+        if cfg.dataset.combine_valid_subsets:
+            task.load_dataset("valid", combine=True, epoch=1)
+        else:
+            for valid_sub_split in cfg.dataset.valid_subset.split(","):
+                task.load_dataset(valid_sub_split, combine=False, epoch=1)
+
+    # (optionally) Configure quantization
+    if cfg.common.quantization_config_path is not None:
+        quantizer = quantization_utils.Quantizer(
+            config_path=cfg.common.quantization_config_path,
+            max_epoch=cfg.optimization.max_epoch,
+            max_update=cfg.optimization.max_update,
+        )
+    else:
+        quantizer = None
+
+    # Build trainer
+    if cfg.common.model_parallel_size == 1:
+        trainer = Trainer(cfg, task, model, criterion, quantizer)
+    else:
+        trainer = MegatronTrainer(cfg, task, model, criterion)
+    logger.info(
+        "training on {} devices (GPUs/TPUs)".format(
+            cfg.distributed_training.distributed_world_size
+        )
+    )
+    logger.info(
+        "max tokens per device = {} and max sentences per device = {}".format(
+            cfg.dataset.max_tokens,
+            cfg.dataset.batch_size,
+        )
+    )
+
+    # Load the latest checkpoint (if one is available) and restore its train iterator
+    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
+        cfg.checkpoint,
+        trainer,
+        # don't cache epoch iterators for sharded datasets
+        disable_iterator_cache=task.has_sharded_data("train"),
+    )
+    if cfg.common.tpu:
+        import torch_xla.core.xla_model as xm
+
+        xm.rendezvous("load_checkpoint")  # wait for all workers
+
+    max_epoch = cfg.optimization.max_epoch or math.inf  # falsy max_epoch: no limit
+    lr = trainer.get_lr()
+
+    # TODO: a dry run on validation set to pin the memory
+    valid_subsets = cfg.dataset.valid_subset.split(",")
+    if not cfg.dataset.disable_validation:
+        for subset in valid_subsets:
+            logger.info('begin dry-run validation on "{}" subset'.format(subset))
+            itr = trainer.get_valid_iterator(subset).next_epoch_itr(
+                shuffle=False, set_dataset_epoch=False  # use a fixed valid set
+            )
+            if cfg.common.tpu:
+                itr = utils.tpu_data_loader(itr)
+            for _ in itr:
+                pass
+    # TODO: end of dry run section
+
+    train_meter = meters.StopwatchMeter()
+    train_meter.start()
+    while epoch_itr.next_epoch_idx <= max_epoch:
+        if lr <= cfg.optimization.stop_min_lr:
+            logger.info(
+                f"stopping training because current learning rate ({lr}) is smaller "
+                "than or equal to minimum learning rate "
+                f"(--stop-min-lr={cfg.optimization.stop_min_lr})"
+            )
+            break
+
+        # train for one epoch
+        valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
+        if should_stop:
+            break
+
+        # only use first validation loss to update the learning rate
+        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])
+
+        epoch_itr = trainer.get_train_iterator(
+            epoch_itr.next_epoch_idx,
+            # sharded data: get train iterator for next epoch
+            load_dataset=task.has_sharded_data("train"),
+            # don't cache epoch iterators for sharded datasets
+            disable_iterator_cache=task.has_sharded_data("train"),
+        )
+    train_meter.stop()
+    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
+
+    # ioPath implementation to wait for all asynchronous file writes to complete.
+    if cfg.checkpoint.write_checkpoints_asynchronously:
+        logger.info(
+            "ioPath PathManager waiting for all asynchronous checkpoint "
+            "writes to finish."
+        )
+        PathManager.async_close()
+        logger.info("ioPath PathManager finished waiting.")
+
+
+def should_stop_early(cfg: DictConfig, valid_loss: float) -> bool:
+    """Report whether validation has gone ``patience`` checks without improving."""
+    # nothing to check when no validation ran or when patience is not positive
+    if valid_loss is None or cfg.checkpoint.patience <= 0:
+        return False
+
+    # the best value and the non-improving count live on the function object
+    best_so_far = getattr(should_stop_early, "best", None)
+    improved = best_so_far is None or (
+        valid_loss > best_so_far
+        if cfg.checkpoint.maximize_best_checkpoint_metric
+        else valid_loss < best_so_far
+    )
+    if improved:
+        should_stop_early.best = valid_loss
+        should_stop_early.num_runs = 0
+        return False
+
+    should_stop_early.num_runs += 1
+    if should_stop_early.num_runs < cfg.checkpoint.patience:
+        return False
+    logger.info(
+        "early stop since valid performance hasn't improved for last {} runs".format(
+            cfg.checkpoint.patience
+        )
+    )
+    return True
+
+
+@metrics.aggregate("train")
+def train(
+    cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr
+) -> Tuple[List[Optional[float]], bool]:
+    """Train the model for one epoch and return (valid_losses, should_stop)."""
+    # Initialize data iterator
+    itr = epoch_itr.next_epoch_itr(
+        fix_batches_to_gpus=cfg.distributed_training.fix_batches_to_gpus,
+        shuffle=(epoch_itr.next_epoch_idx > cfg.dataset.curriculum),
+    )
+    update_freq = (
+        cfg.optimization.update_freq[epoch_itr.epoch - 1]
+        if epoch_itr.epoch <= len(cfg.optimization.update_freq)
+        else cfg.optimization.update_freq[-1]  # last entry covers later epochs
+    )
+    itr = iterators.GroupedIterator(
+        itr,
+        update_freq,
+        skip_remainder_batch=cfg.optimization.skip_remainder_batch,
+    )
+    if cfg.common.tpu:
+        itr = utils.tpu_data_loader(itr)
+    progress = progress_bar.progress_bar(
+        itr,
+        log_format=cfg.common.log_format,
+        log_file=cfg.common.log_file,
+        log_interval=cfg.common.log_interval,
+        epoch=epoch_itr.epoch,
+        aim_repo=(
+            cfg.common.aim_repo
+            if distributed_utils.is_master(cfg.distributed_training)
+            else None
+        ),
+        aim_run_hash=(
+            cfg.common.aim_run_hash
+            if distributed_utils.is_master(cfg.distributed_training)
+            else None
+        ),
+        aim_param_checkpoint_dir=cfg.checkpoint.save_dir,
+        tensorboard_logdir=(
+            cfg.common.tensorboard_logdir
+            if distributed_utils.is_master(cfg.distributed_training)
+            else None
+        ),
+        default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+        wandb_project=(
+            cfg.common.wandb_project
+            if distributed_utils.is_master(cfg.distributed_training)
+            else None
+        ),
+        wandb_run_name=os.environ.get(
+            "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)
+        ),
+        azureml_logging=(
+            cfg.common.azureml_logging
+            if distributed_utils.is_master(cfg.distributed_training)
+            else False
+        ),
+    )
+    progress.update_config(_flatten_config(cfg))
+    trainer.begin_epoch(epoch_itr.epoch)
+
+    valid_subsets = cfg.dataset.valid_subset.split(",")
+    valid_losses = [None]  # returned unchanged if the epoch yields no batches
+    should_stop = False
+    num_updates = trainer.get_num_updates()
+    logger.info("Start iterating over samples")
+    for i, samples in enumerate(progress):
+        with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function(
+            "train_step-%d" % i
+        ):
+            log_output = trainer.train_step(samples)
+
+        if log_output is not None:  # not OOM, overflow, ...
+            # log mid-epoch stats
+            num_updates = trainer.get_num_updates()
+            if num_updates % cfg.common.log_interval == 0:
+                stats = get_training_stats(metrics.get_smoothed_values("train_inner"))
+                progress.log(stats, tag="train_inner", step=num_updates)
+
+                # reset mid-epoch stats after each log interval
+                # the end-of-epoch stats will still be preserved
+                metrics.reset_meters("train_inner")
+
+        end_of_epoch = not itr.has_next()
+        valid_losses, should_stop = validate_and_save(
+            cfg, trainer, task, epoch_itr, valid_subsets, end_of_epoch
+        )
+
+        if should_stop:
+            break
+
+    # log end-of-epoch stats
+    logger.info("end of epoch {} (average epoch stats below)".format(epoch_itr.epoch))
+    stats = get_training_stats(metrics.get_smoothed_values("train"))
+    progress.print(stats, tag="train", step=num_updates)
+
+    # reset epoch-level meters
+    metrics.reset_meters("train")
+    return valid_losses, should_stop
+
+
+def _flatten_config(cfg: DictConfig):
+    """Convert ``cfg`` to a plain container, moving a Namespace value to "args"."""
+    flat = OmegaConf.to_container(cfg)
+    legacy_ns = None
+    # drop every top-level Namespace entry; the last one seen is kept as "args"
+    for key in list(flat):
+        if isinstance(flat[key], argparse.Namespace):
+            legacy_ns = flat.pop(key)
+    if legacy_ns is not None:
+        flat["args"] = vars(legacy_ns)
+    return flat
+
+
+def validate_and_save(
+    cfg: DictConfig,
+    trainer: Trainer,
+    task: tasks.FairseqTask,
+    epoch_itr,
+    valid_subsets: List[str],
+    end_of_epoch: bool,
+) -> Tuple[List[Optional[float]], bool]:
+    """Validate and/or checkpoint when due; return (valid_losses, should_stop)."""
+    num_updates = trainer.get_num_updates()
+    max_update = cfg.optimization.max_update or math.inf  # falsy value: no limit
+
+    # Stopping conditions (a further one, based on validation loss, is applied below)
+    should_stop = False
+    if num_updates >= max_update:
+        should_stop = True
+        logger.info(
+            f"Stopping training due to "
+            f"num_updates: {num_updates} >= max_update: {max_update}"
+        )
+
+    training_time_hours = trainer.cumulative_training_time() / (60 * 60)
+    if (
+        cfg.optimization.stop_time_hours > 0
+        and training_time_hours > cfg.optimization.stop_time_hours
+    ):
+        should_stop = True
+        logger.info(
+            f"Stopping training due to "
+            f"cumulative_training_time: {training_time_hours} > "
+            f"stop_time_hours: {cfg.optimization.stop_time_hours} hour(s)"
+        )
+
+    # Save at the end of every save_interval-th epoch, when stopping, or every
+    # save_interval_updates updates (once validate_after_updates is reached).
+    do_save = (
+        (end_of_epoch and epoch_itr.epoch % cfg.checkpoint.save_interval == 0)
+        or should_stop
+        or (
+            cfg.checkpoint.save_interval_updates > 0
+            and num_updates > 0
+            and num_updates % cfg.checkpoint.save_interval_updates == 0
+            and num_updates >= cfg.dataset.validate_after_updates
+        )
+    )
+    do_validate = (
+        (
+            (not end_of_epoch and do_save)  # validate during mid-epoch saves
+            or (end_of_epoch and epoch_itr.epoch % cfg.dataset.validate_interval == 0)
+            or should_stop
+            or (
+                cfg.dataset.validate_interval_updates > 0
+                and num_updates > 0
+                and num_updates % cfg.dataset.validate_interval_updates == 0
+            )
+        )
+        and not cfg.dataset.disable_validation
+        and num_updates >= cfg.dataset.validate_after_updates
+    )
+
+    # Validate; [None] stands for "no validation was run this time"
+    valid_losses = [None]
+    if do_validate:
+        valid_losses = validate(cfg, trainer, task, epoch_itr, valid_subsets)
+    # Early stopping looks only at the first validation value
+    should_stop |= should_stop_early(cfg, valid_losses[0])
+
+    # Save checkpoint
+    if do_save or should_stop:
+        cp_path = checkpoint_utils.save_checkpoint(
+            cfg.checkpoint, trainer, epoch_itr, valid_losses[0]
+        )
+        if cp_path is not None and hasattr(task, "post_save"):
+            task.post_save(cp_path, num_updates)
+
+    return valid_losses, should_stop
+
+
+def get_training_stats(stats: Dict[str, Any]) -> Dict[str, Any]:
+    stats.update(wall=round(metrics.get_meter("default", "wall").elapsed_time, 0))
+    return stats  # the same mapping, now carrying the rounded "wall" elapsed time
+
+
+def validate(
+    cfg: DictConfig,
+    trainer: Trainer,
+    task: tasks.FairseqTask,
+    epoch_itr,
+    subsets: List[str],
+) -> List[Optional[float]]:
+    """Evaluate the model on the validation set(s) and return the losses."""
+    if cfg.dataset.fixed_validation_seed is not None:
+        # set fixed seed for every validation
+        utils.set_torch_seed(cfg.dataset.fixed_validation_seed)
+
+    trainer.begin_valid_epoch(epoch_itr.epoch)
+    valid_losses = []
+    for subset_idx, subset in enumerate(subsets):
+        logger.info('begin validation on "{}" subset'.format(subset))
+
+        # Initialize data iterator
+        itr = trainer.get_valid_iterator(subset).next_epoch_itr(
+            shuffle=False, set_dataset_epoch=False  # use a fixed valid set
+        )
+        if cfg.common.tpu:
+            itr = utils.tpu_data_loader(itr)
+        progress = progress_bar.progress_bar(
+            itr,
+            log_format=cfg.common.log_format,
+            log_interval=cfg.common.log_interval,
+            epoch=epoch_itr.epoch,
+            prefix=f"valid on '{subset}' subset",
+            aim_repo=(
+                cfg.common.aim_repo
+                if distributed_utils.is_master(cfg.distributed_training)
+                else None
+            ),
+            aim_run_hash=(
+                cfg.common.aim_run_hash
+                if distributed_utils.is_master(cfg.distributed_training)
+                else None
+            ),
+            aim_param_checkpoint_dir=cfg.checkpoint.save_dir,
+            tensorboard_logdir=(
+                cfg.common.tensorboard_logdir
+                if distributed_utils.is_master(cfg.distributed_training)
+                else None
+            ),
+            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+            wandb_project=(
+                cfg.common.wandb_project
+                if distributed_utils.is_master(cfg.distributed_training)
+                else None
+            ),
+            wandb_run_name=os.environ.get(
+                "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)
+            ),
+        )
+
+        # create a new root metrics aggregator so validation metrics
+        # don't pollute other aggregators (e.g., train meters)
+        with metrics.aggregate(new_root=True) as agg:
+            for i, sample in enumerate(progress):
+                # stop once max_valid_steps batches have been evaluated
+                if (
+                    cfg.dataset.max_valid_steps is not None
+                    and i >= cfg.dataset.max_valid_steps
+                ):
+                    break
+                trainer.valid_step(sample)
+
+        # log validation stats
+        # only tracking the best metric on the 1st validation subset
+        tracking_best = subset_idx == 0
+        stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values(), tracking_best)
+
+        if hasattr(task, "post_validate"):
+            task.post_validate(trainer.get_model(), stats, agg)
+
+        progress.print(stats, tag=subset, step=trainer.get_num_updates())
+
+        valid_losses.append(stats[cfg.checkpoint.best_checkpoint_metric])
+    return valid_losses
+
+
+def get_valid_stats(
+    cfg: DictConfig,
+    trainer: Trainer,
+    stats: Dict[str, Any],
+    tracking_best: bool,
+) -> Dict[str, Any]:
+    """Add ``num_updates`` and, if tracked, the best metric so far to ``stats``."""
+    stats["num_updates"] = trainer.get_num_updates()
+    if not (tracking_best and hasattr(checkpoint_utils.save_checkpoint, "best")):
+        return stats
+    metric = cfg.checkpoint.best_checkpoint_metric
+    pick = max if cfg.checkpoint.maximize_best_checkpoint_metric else min
+    best_seen = checkpoint_utils.save_checkpoint.best
+    stats["best_{0}".format(metric)] = pick(best_seen, stats[metric])
+    return stats
+
+
+def cli_main(
+    modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None
+) -> None:
+    """Parse the training command line and launch ``main`` through ``call_main``."""
+    args = options.parse_args_and_arch(
+        options.get_training_parser(), modify_parser=modify_parser
+    )
+    cfg = convert_namespace_to_omegaconf(args)
+
+    if cfg.common.use_plasma_view:
+        plasma = PlasmaStore(path=cfg.common.plasma_path)
+        logger.info(
+            f"Started plasma server pid {plasma.server.pid} {cfg.common.plasma_path}"
+        )
+        # NOTE: nothing in this function stops the plasma server again
+
+    if not args.profile:
+        distributed_utils.call_main(cfg, main)
+        return
+
+    # --profile was given: run inside torch's CUDA profiler and emit_nvtx contexts
+    with torch.cuda.profiler.profile(), torch.autograd.profiler.emit_nvtx():
+        distributed_utils.call_main(cfg, main)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
new file mode 100644
index 0000000000..4617b6d542
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+from argparse import Namespace
+from itertools import chain
+
+import torch
+from omegaconf import DictConfig
+
+from fairseq import checkpoint_utils, distributed_utils, options, utils
+from fairseq.dataclass.utils import convert_namespace_to_omegaconf
+from fairseq.logging import metrics, progress_bar
+from fairseq.utils import reset_logging
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),  # LOGLEVEL env var, default INFO
+    stream=sys.stdout,  # log to stdout rather than the default stderr
+)
+logger = logging.getLogger("fairseq_cli.validate")
+
+
+def main(cfg: DictConfig, override_args=None):
+    """Load the checkpoint at ``cfg.common_eval.path`` and evaluate each valid subset."""
+    if isinstance(cfg, Namespace):
+        cfg = convert_namespace_to_omegaconf(cfg)
+
+    utils.import_user_module(cfg.common)
+    reset_logging()
+
+    assert (
+        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
+    ), "Must specify batch size either with --max-tokens or --batch-size"
+
+    use_fp16 = cfg.common.fp16
+    use_cuda = torch.cuda.is_available() and not cfg.common.cpu
+    if use_cuda:
+        torch.cuda.set_device(cfg.distributed_training.device_id)
+
+    if cfg.distributed_training.distributed_world_size > 1:
+        data_parallel_world_size = distributed_utils.get_data_parallel_world_size()
+        data_parallel_rank = distributed_utils.get_data_parallel_rank()
+    else:
+        data_parallel_world_size = 1
+        data_parallel_rank = 0
+
+    if override_args is not None:
+        overrides = vars(override_args)
+        # NOTE(review): eval() executes model_overrides as code; pass trusted input only
+        overrides.update(eval(getattr(override_args, "model_overrides", "{}")))
+    else:
+        overrides = None
+
+    # Load ensemble
+    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
+    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+        [cfg.common_eval.path],
+        arg_overrides=overrides,
+        suffix=cfg.checkpoint.checkpoint_suffix,
+    )
+    model = models[0]  # the model passed to task.valid_step below
+
+    # Move models to GPU (loop variable kept distinct from `model` above)
+    for m in models:
+        m.eval()
+        if use_fp16:
+            m.half()
+        if use_cuda:
+            m.cuda()
+
+    logger.info(saved_cfg)
+
+    # Build criterion
+    criterion = task.build_criterion(saved_cfg.criterion)
+    criterion.eval()
+
+    for subset in cfg.dataset.valid_subset.split(","):
+        try:
+            task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task)
+            dataset = task.dataset(subset)
+        except KeyError:
+            raise Exception("Cannot find dataset: " + subset)
+
+        # Initialize data iterator
+        itr = task.get_batch_iterator(
+            dataset=dataset,
+            max_tokens=cfg.dataset.max_tokens,
+            max_sentences=cfg.dataset.batch_size,
+            max_positions=utils.resolve_max_positions(
+                task.max_positions(),
+                *[m.max_positions() for m in models],
+            ),
+            ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
+            required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
+            seed=cfg.common.seed,
+            num_shards=data_parallel_world_size,
+            shard_id=data_parallel_rank,
+            num_workers=cfg.dataset.num_workers,
+            data_buffer_size=cfg.dataset.data_buffer_size,
+        ).next_epoch_itr(shuffle=False)
+        progress = progress_bar.progress_bar(
+            itr,
+            log_format=cfg.common.log_format,
+            log_interval=cfg.common.log_interval,
+            prefix=f"valid on '{subset}' subset",
+            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
+        )
+
+        log_outputs = []
+        i = 0  # keeps `step=i` below defined when this subset yields no batches
+        for i, sample in enumerate(progress):
+            sample = utils.move_to_cuda(sample) if use_cuda else sample
+            _loss, _sample_size, log_output = task.valid_step(sample, model, criterion)
+            progress.log(log_output, step=i)
+            log_outputs.append(log_output)
+
+        if data_parallel_world_size > 1:
+            log_outputs = distributed_utils.all_gather_list(
+                log_outputs,
+                max_size=cfg.common.all_gather_list_size,
+                group=distributed_utils.get_data_parallel_group(),
+            )
+            log_outputs = list(chain.from_iterable(log_outputs))
+
+        with metrics.aggregate() as agg:
+            task.reduce_metrics(log_outputs, criterion)
+            log_output = agg.get_smoothed_values()
+
+        progress.print(log_output, tag=subset, step=i)
+
+
+def cli_main():
+    """Parse validation arguments (twice) and run ``main`` through ``call_main``."""
+    args = options.parse_args_and_arch(options.get_validation_parser())
+
+    # A second parse with defaults suppressed keeps only the options that were
+    # explicitly given on the command line; these are passed on as overrides.
+    explicit_only = options.parse_args_and_arch(
+        options.get_validation_parser(), suppress_defaults=True
+    )
+    cfg = convert_namespace_to_omegaconf(args)
+    distributed_utils.call_main(cfg, main, override_args=explicit_only)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    cli_main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
new file mode 100644
index 0000000000..5949e274ed
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import functools
+import importlib
+
+
+dependencies = [
+    "dataclasses",
+    "hydra",
+    "numpy",
+    "omegaconf",
+    "regex",
+    "requests",
+    "torch",
+]
+
+
+# Check for required dependencies and raise a RuntimeError if any are missing.
+missing_deps = []
+for dep in dependencies:
+    try:
+        importlib.import_module(dep)
+    except ImportError:
+        # Hack: the hydra package is provided under the "hydra-core" name in
+        # pypi. We don't want the user mistakenly calling `pip install hydra`
+        # since that will install an unrelated package.
+        if dep == "hydra":
+            dep = "hydra-core"
+        missing_deps.append(dep)
+if len(missing_deps) > 0:
+    raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps)))
+
+
+# only do fairseq imports after checking for dependencies
+from fairseq.hub_utils import (  # noqa; noqa
+    BPEHubInterface as bpe,
+    TokenizerHubInterface as tokenizer,
+)
+from fairseq.models import MODEL_REGISTRY  # noqa
+
+
+# torch.hub doesn't build Cython components, so if they are not found then try
+# to build them here
+try:
+    import fairseq.data.token_block_utils_fast  # noqa
+except ImportError:
+    try:
+        import cython  # noqa
+        import os
+        from setuptools import sandbox
+
+        sandbox.run_setup(
+            os.path.join(os.path.dirname(__file__), "setup.py"),
+            ["build_ext", "--inplace"],
+        )
+    except ImportError:
+        print(
+            "Unable to build Cython components. Please make sure Cython is "
+            "installed if the torch.hub model you are loading depends on it."
+        )
+
+
+# automatically expose models defined in FairseqModel::hub_models
+for _model_type, _cls in MODEL_REGISTRY.items():
+    for model_name in _cls.hub_models().keys():
+        globals()[model_name] = functools.partial(
+            _cls.from_pretrained,
+            model_name,
+        )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/pyproject.toml b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/pyproject.toml
new file mode 100644
index 0000000000..4d84c9bc36
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = [
+  "setuptools>=18.0",
+  "wheel",
+  "cython",
+  "numpy>=1.21.3",
+  "torch>=1.10",
+]
+build-backend = "setuptools.build_meta"
+
+[tool.black]
+extend-exclude = '''
+(
+^/examples/|
+^/fairseq/model_parallel/megatron|
+^/build/
+)
+'''
+
+[tool.isort]
+profile = "black"
+known_third_party = "_cffi_backend,agg_results,aml,bitarray,boto3,botocore,dump_hubert_feature,dynamicconv_cuda,editdistance,faiss,fasttext,feature_utils,ffmpeg,g2p_en,h5py,hydra,hypothesis,indicnlp,inflect,iopath,joblib,kaldi_io,kenlm,libfb,librosa,lightconv_cuda,matplotlib,misc,mmpt,mmpt_cli,model,nltk,npy_append_array,numpy,omegaconf,pandas,pathbuilder,preprocessing,progressbar,pythainlp,random_sequence_shuffler,regex,sacrebleu,sacremoses,scipy,sentencepiece,setuptools,six,sklearn,soundfile,sweep,sweep_wmt_en2de_transformer_big_common,tabulate,torch,torchaudio,tqdm,unidecode,utils,videoreader,wav2vec_cluster_faiss,wget,yaml"
+skip_gitignore = true
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
new file mode 100644
index 0000000000..69a5e8dda3
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
@@ -0,0 +1,72 @@
+import argparse
+from typing import Tuple
+
+
+def get_next_version(release_type) -> Tuple[Tuple[int, int, int], str, str]:
+    """Return the next (version tuple, version string, tag) for a release type."""
+    raw_version = find_version("fairseq/version.txt")
+    numbers = [int(piece) for piece in raw_version.strip("'").split(".")]
+    major, minor, patch = numbers[0], numbers[1], numbers[2]
+
+    # Bumping a component resets every less significant component to zero.
+    if release_type == "major":
+        major, minor, patch = major + 1, 0, 0
+    elif release_type == "minor":
+        minor, patch = minor + 1, 0
+    elif release_type == "patch":
+        patch += 1
+    else:
+        raise ValueError(
+            "Incorrect release type specified. Acceptable types are major, minor and patch."
+        )
+
+    next_version = (major, minor, patch)
+    next_version_str = ".".join(str(part) for part in next_version)
+    return next_version, next_version_str, "v" + next_version_str
+
+
+def find_version(version_file_path) -> str:
+    """Return the contents of the given version file, stripped of whitespace."""
+    with open(version_file_path) as handle:
+        return handle.read().strip()
+
+
+def update_version(new_version_str) -> None:
+    """
+    Overwrite fairseq/version.txt (relative to the current working
+    directory) with the given version string.
+    """
+
+    with open("fairseq/version.txt", "w") as writer:
+        writer.write(new_version_str)
+
+
+def main(args):
+    """Print the next version and tag; optionally persist the new version."""
+    if args.release_type not in ["major", "minor", "patch"]:
+        raise ValueError("Incorrect release type specified")
+    _version_tuple, next_version, next_tag = get_next_version(args.release_type)
+
+    if args.update_version:
+        update_version(next_version)
+
+    print(next_version, next_tag)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Versioning utils")
+    parser.add_argument(
+        "--release-type",
+        type=str,
+        required=True,
+        help="type of release = major/minor/patch",
+    )
+    parser.add_argument(
+        "--update-version",
+        action="store_true",
+        required=False,
+        help="updates the version in fairseq/version.txt",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
new file mode 100644
index 0000000000..49f4f9d912
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import collections
+import os
+import re
+
+import torch
+
+from fairseq.file_io import PathManager
+
+
+def average_checkpoints(inputs):
+    """Loads checkpoints from inputs and returns a model with averaged weights.
+
+    Args:
+      inputs: A non-empty sequence of string paths of checkpoints to load from.
+
+    Returns:
+      The state dict of the first checkpoint, with its 'model' entry replaced
+      by an OrderedDict mapping string parameter names to averaged Tensors.
+
+    Raises:
+      ValueError: If inputs is empty.
+      KeyError: If a checkpoint's parameter names differ from the first one's.
+    """
+    if not inputs:
+        raise ValueError("At least one checkpoint is required for averaging")
+    params_dict = collections.OrderedDict()
+    params_keys = new_state = None
+    num_models = len(inputs)
+
+    for fpath in inputs:
+        with PathManager.open(fpath, "rb") as f:
+            state = torch.load(
+                f,
+                map_location=(
+                    lambda s, _: torch.serialization.default_restore_location(s, "cpu")
+                ),
+            )
+        # Copies over the settings from the first checkpoint
+        if new_state is None:
+            new_state = state
+        model_params = state["model"]
+        model_params_keys = list(model_params.keys())
+        if params_keys is None:
+            params_keys = model_params_keys
+        elif params_keys != model_params_keys:
+            raise KeyError(
+                "For checkpoint {}, expected list of params: {}, "
+                "but found: {}".format(fpath, params_keys, model_params_keys)
+            )
+
+        for k in params_keys:
+            p = model_params[k]
+            if isinstance(p, torch.HalfTensor):
+                p = p.float()
+            if k not in params_dict:
+                params_dict[k] = p.clone()
+                # NOTE: clone() is needed in case of p is a shared parameter
+            else:
+                params_dict[k] += p
+
+    for k, v in params_dict.items():
+        if v.is_floating_point():
+            v.div_(num_models)
+        else:
+            params_dict[k] //= num_models
+    new_state["model"] = params_dict
+    return new_state
+
+
+def last_n_checkpoints(paths, n, update_based, upper_bound=None):
+    """Return the n highest-numbered checkpoint paths in paths[0], highest first.
+
+    Ranks by update number if update_based, else by epoch; raises if fewer than n.
+    """
+    assert len(paths) == 1
+    path = paths[0]
+    pattern = r"checkpoint_\d+_(\d+)\.pt" if update_based else r"checkpoint(\d+)\.pt"
+    pt_regexp = re.compile(pattern)
+    entries = []
+    for f in PathManager.ls(path):
+        m = pt_regexp.fullmatch(f)
+        if m is not None:
+            sort_key = int(m.group(1))
+            if upper_bound is None or sort_key <= upper_bound:
+                entries.append((sort_key, m.group(0)))
+    if len(entries) < n:
+        raise Exception(
+            "Found {} checkpoint files but need at least {}".format(len(entries), n)
+        )
+    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Tool to average the params of input checkpoints to "
+        "produce a new checkpoint",
+    )
+    # fmt: off
+    parser.add_argument('--inputs', required=True, nargs='+',
+                        help='Input checkpoint file paths.')
+    parser.add_argument('--output', required=True, metavar='FILE',
+                        help='Write the new checkpoint containing the averaged weights to this path.')
+    num_group = parser.add_mutually_exclusive_group()
+    num_group.add_argument('--num-epoch-checkpoints', type=int,
+                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
+                           'path specified by input, and average last this many of them.')
+    num_group.add_argument('--num-update-checkpoints', type=int,
+                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
+                           ' input, and average last this many of them.')
+    num_group.add_argument('--num-best-checkpoints', type=int, default=0,
+                           help='if set, will try to find checkpoints with names checkpoint_best_ee_xx.pt in the path specified by'
+                           ' input, and average last this many of them.')
+    parser.add_argument('--checkpoint-upper-bound', type=int,
+                        help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
+                        'when using --num-update-checkpoints, this will set an upper bound on which update to use'
+                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
+                        ' averaged.'
+                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
+                        ' be averaged assuming --save-interval-updates 500'
+                        )
+    # fmt: on
+    args = parser.parse_args()
+    print(args)
+
+    num = None
+    is_update_based = False
+    if args.num_update_checkpoints is not None:
+        num = args.num_update_checkpoints
+        is_update_based = True
+    elif args.num_epoch_checkpoints is not None:
+        num = args.num_epoch_checkpoints
+
+    assert args.checkpoint_upper_bound is None or (
+        args.num_epoch_checkpoints is not None
+        or args.num_update_checkpoints is not None
+    ), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
+    assert (
+        args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
+    ), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"
+
+    if num is not None:
+        args.inputs = last_n_checkpoints(
+            args.inputs,
+            num,
+            is_update_based,
+            upper_bound=args.checkpoint_upper_bound,
+        )
+        print("averaging checkpoints: ", args.inputs)
+
+    if args.num_best_checkpoints > 0:
+        args.inputs = list(
+            sorted(
+                args.inputs,
+                key=lambda x: float(
+                    os.path.basename(x).split("_")[-1].replace(".pt", "")
+                ),
+            )
+        )
+        args.inputs = args.inputs[: args.num_best_checkpoints]
+        for path in args.inputs:
+            print(os.path.basename(path))
+    new_state = average_checkpoints(args.inputs)
+    with PathManager.open(args.output, "wb") as f:
+        torch.save(new_state, f)
+    print("Finished writing averaged checkpoint to {}".format(args.output))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
new file mode 100644
index 0000000000..0ca5c18f7b
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
@@ -0,0 +1,97 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Use this script in order to build symmetric alignments for your translation
+dataset.
+This script depends on fast_align and mosesdecoder tools. You will need to
+build those before running the script.
+fast_align:
+    github: http://github.com/clab/fast_align
+    instructions: follow the instructions in README.md
+mosesdecoder:
+    github: http://github.com/moses-smt/mosesdecoder
+    instructions: http://www.statmt.org/moses/?n=Development.GetStarted
+The script produces the following files under --output_dir:
+    text.joined - concatenation of lines from the source_file and the
+    target_file.
+    align.forward - forward pass of fast_align.
+    align.backward - backward pass of fast_align.
+    aligned.sym_heuristic - symmetrized alignment.
+"""
+
+import argparse
+import os
+from itertools import zip_longest
+
+
+def main():
+    """Join the parallel corpus, run fast_align both ways and symmetrize."""
+    parser = argparse.ArgumentParser(description="symmetric alignment builer")
+    # fmt: off
+    parser.add_argument('--fast_align_dir', required=True,
+                        help='path to fast_align build directory')
+    parser.add_argument('--mosesdecoder_dir', required=True,
+                        help='path to mosesdecoder root directory')
+    parser.add_argument('--sym_heuristic',
+                        help='heuristic to use for symmetrization',
+                        default='grow-diag-final-and')
+    parser.add_argument('--source_file', required=True,
+                        help='path to a file with sentences in the source language')
+    parser.add_argument('--target_file', required=True,
+                        help='path to a file with sentences in the target language')
+    parser.add_argument('--output_dir', required=True,
+                        help='output directory')
+    # fmt: on
+    args = parser.parse_args()
+
+    fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
+    symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
+    sym_fast_align_bin = os.path.join(
+        args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
+    )
+
+    # create joined file
+    joined_file = os.path.join(args.output_dir, "text.joined")
+    with open(args.source_file, "r", encoding="utf-8") as src, open(
+        args.target_file, "r", encoding="utf-8"
+    ) as tgt:
+        with open(joined_file, "w", encoding="utf-8") as joined:
+            for s, t in zip_longest(src, tgt):
+                # zip_longest pads the shorter file with None; fail clearly on a mismatch.
+                if s is None or t is None:
+                    raise ValueError("source and target files differ in line count")
+                print("{} ||| {}".format(s.strip(), t.strip()), file=joined)
+
+    # run forward alignment
+    fwd_align_file = os.path.join(args.output_dir, "align.forward")
+    fwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v > {FWD}".format(
+        FASTALIGN=fast_align_bin, JOINED=joined_file, FWD=fwd_align_file
+    )
+    assert os.system(fwd_fast_align_cmd) == 0
+
+    # run backward alignment
+    bwd_align_file = os.path.join(args.output_dir, "align.backward")
+    bwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}".format(
+        FASTALIGN=fast_align_bin, JOINED=joined_file, BWD=bwd_align_file
+    )
+    assert os.system(bwd_fast_align_cmd) == 0
+
+    # run symmetrization
+    sym_out_file = os.path.join(args.output_dir, "aligned")
+    sym_cmd = "{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}".format(
+        SYMFASTALIGN=sym_fast_align_bin,
+        FWD=fwd_align_file,
+        BWD=bwd_align_file,
+        SRC=args.source_file,
+        TGT=args.target_file,
+        OUT=sym_out_file,
+        HEURISTIC=args.sym_heuristic,
+        SYMAL=symal_bin,
+    )
+    assert os.system(sym_cmd) == 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
new file mode 100644
index 0000000000..e5a9d9dd46
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+import os
+
+cwd = Path(".").resolve()
+print("running 'check_installation.py' from:", cwd)
+
+# Old versions of numpy/torch can prevent loading the .so files
+import torch
+
+print("torch:", torch.__version__)
+import numpy
+
+print("numpy:", numpy.__version__)
+
+import fairseq
+
+print("Fairseq installed at:", fairseq.__file__)
+import fairseq.criterions
+import fairseq.dataclass.configs
+
+import _imp
+
+print("Should load following .so suffixes:", _imp.extension_suffixes())
+
+so_files = list(Path(fairseq.__file__).parent.glob("*.so"))
+so_files.extend(Path(fairseq.__file__).parent.glob("data/*.so"))
+print("Found following .so files:")
+for so_file in so_files:
+    print(f"- {so_file}")
+
+from fairseq import libbleu
+
+print("Found libbleu at", libbleu.__file__)
+from fairseq.data import data_utils_fast
+
+print("Found data_utils_fast at", data_utils_fast.__file__)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
new file mode 100644
index 0000000000..bc24db624f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+"""Helper script to compare two argparse.Namespace objects."""
+
+from argparse import Namespace  # noqa
+
+
+def main():
+    """Print the differences between two Namespace reprs read from stdin."""
+    ns1 = eval(input("Namespace 1: "))  # NOTE(review): eval() executes whatever is
+    ns2 = eval(input("Namespace 2: "))  # pasted in; only use with trusted input.
+
+    def keys(ns):
+        ks = set()
+        for k in dir(ns):
+            if not k.startswith("_"):
+                ks.add(k)
+        return ks
+
+    k1 = keys(ns1)
+    k2 = keys(ns2)
+
+    def print_keys(ks, ns1, ns2=None):
+        for k in ks:
+            if ns2 is None:
+                print("{}\t{}".format(k, getattr(ns1, k, None)))
+            else:
+                print(
+                    "{}\t{}\t{}".format(k, getattr(ns1, k, None), getattr(ns2, k, None))
+                )
+
+    print("Keys unique to namespace 1:")
+    print_keys(k1 - k2, ns1)
+    print()
+
+    print("Keys unique to namespace 2:")
+    print_keys(k2 - k1, ns2)
+    print()
+
+    print("Overlapping keys with different values:")
+    ks = [k for k in k1 & k2 if getattr(ns1, k, "None") != getattr(ns2, k, "None")]
+    print_keys(ks, ns1, ns2)
+    print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compound_split_bleu.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compound_split_bleu.sh
new file mode 100644
index 0000000000..1972fddceb
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compound_split_bleu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Rewrite hyphenated words (a-b -> a ##AT##-##AT## b) in hypotheses/references, then score.
+if [ $# -ne 1 ]; then
+    echo "usage: $0 GENERATE_PY_OUTPUT"
+    exit 1
+fi
+
+GEN=$1
+
+SYS=$GEN.sys
+REF=$GEN.ref
+# The last line of the input must mention BLEU, otherwise generation is treated as unfinished.
+if ! tail -n 1 "$GEN" | grep -q BLEU; then
+    echo "not done generating"
+    exit
+fi
+
+grep ^H "$GEN" | awk -F '\t' '{print $NF}' | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > "$SYS"
+grep ^T "$GEN" | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > "$REF"
+fairseq-score --sys "$SYS" --ref "$REF"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
new file mode 100644
index 0000000000..437b373856
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Extracts random constraints from reference files."""
+
+import argparse
+import random
+import sys
+
+
+def get_phrase(words, index, length):
+    assert index < len(words) - length + 1  # the phrase must end inside words
+    phr = " ".join(words[index : index + length])
+    for i in range(index, index + length):  # i is unused; runs once per word
+        words.pop(index)  # later words shift left, so index stays fixed
+    return phr
+
+
+def main(args):
+    """Read lines from stdin and print each source with sampled target phrases."""
+    if args.seed:
+        random.seed(args.seed)  # a seed of 0 (the default) leaves the RNG unseeded
+
+    for line in sys.stdin:
+        constraints = []
+
+        def add_constraint(constraint):
+            constraints.append(constraint)
+
+        source = line.rstrip()
+        if "\t" in line:
+            source, target = line.split("\t")  # ValueError if more than one tab
+            if args.add_sos:
+                target = f"<s> {target}"
+            if args.add_eos:
+                target = f"{target} </s>"
+
+            if len(target.split()) >= args.len:
+                words = [target]  # pool of text segments still available to sample
+
+                num = args.number
+
+                choices = {}
+                for i in range(num):
+                    if len(words) == 0:
+                        break
+                    segmentno = random.choice(range(len(words)))
+                    segment = words.pop(segmentno)
+                    tokens = segment.split()
+                    phrase_index = random.choice(range(len(tokens)))
+                    choice = " ".join(
+                        tokens[phrase_index : min(len(tokens), phrase_index + args.len)]
+                    )
+                    for j in range(
+                        phrase_index, min(len(tokens), phrase_index + args.len)
+                    ):
+                        tokens.pop(phrase_index)  # remove the chosen phrase's tokens
+                    if phrase_index > 0:
+                        words.append(" ".join(tokens[0:phrase_index]))
+                    # NOTE(review): a right remainder of exactly one token is not re-added.
+                    if phrase_index + 1 < len(tokens):
+                        words.append(" ".join(tokens[phrase_index:]))
+                    choices[target.find(choice)] = choice  # keyed by offset in target
+
+                    # mask out with spaces
+                    target = target.replace(choice, " " * len(choice), 1)
+
+                for key in sorted(choices.keys()):
+                    add_constraint(choices[key])
+
+        print(source, *constraints, sep="\t")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--number", "-n", type=int, default=1, help="number of phrases")
+    parser.add_argument("--len", "-l", type=int, default=1, help="phrase length")
+    parser.add_argument(
+        "--add-sos", default=False, action="store_true", help="add <s> token"
+    )
+    parser.add_argument(
+        "--add-eos", default=False, action="store_true", help="add </s> token"
+    )
+    parser.add_argument("--seed", "-s", default=0, type=int)
+    args = parser.parse_args()
+
+    main(args)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
new file mode 100644
index 0000000000..d531ad9f39
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+
+
+"""Reads in a fairseq output file, and verifies that the constraints
+(C- lines) are present in the output (the first H- line). Assumes that
+constraints are listed prior to the first hypothesis.
+"""
+
+constraints = []
+found = 0
+total = 0
+for line in sys.stdin:
+    if line.startswith("C-"):
+        constraints.append(line.rstrip().split("\t")[1])
+    elif line.startswith("H-"):
+        text = line.split("\t")[2]
+
+        for constraint in constraints:
+            total += 1
+            if constraint in text:
+                found += 1
+            else:
+                print(f"No {constraint} in {text}", file=sys.stderr)
+
+        constraints = []
+# max(total, 1) avoids a ZeroDivisionError when the input holds no constraints.
+print(f"Found {found} / {total} = {100 * found / max(total, 1):.1f}%")
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_dictionary.lua b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_dictionary.lua
new file mode 100644
index 0000000000..14ee8c997f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_dictionary.lua
@@ -0,0 +1,34 @@
+-- Copyright (c) Facebook, Inc. and its affiliates.
+--
+-- This source code is licensed under the MIT license found in the
+-- LICENSE file in the root directory of this source tree.
+--
+-- Usage: convert_dictionary.lua <dict.th7>
+require 'fairseq'
+require 'torch'
+require 'paths'
+
+if #arg < 1 then
+   print('usage: convert_dictionary.lua <dict.th7>')
+   os.exit(1)
+end
+if not paths.filep(arg[1]) then
+   print('error: file does not exist: ' .. arg[1])
+   os.exit(1)
+end
+
+dict = torch.load(arg[1])
+-- Escape the dot and anchor at the end so only a trailing ".th7" is replaced.
+dst = paths.basename(arg[1]):gsub('%.th7$', '.txt')
+assert(dst:match('%.txt$'))
+
+-- io.open returns nil plus a message on failure; assert surfaces that message.
+f = assert(io.open(dst, 'w'))
+for idx, symbol in ipairs(dict.index_to_symbol) do
+  if idx > dict.cutoff then
+    break
+  end
+  -- One "<symbol> <frequency>" line per dictionary entry.
+  f:write(symbol, ' ', dict.index_to_freq[idx], '\n')
+end
+f:close()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_model.lua b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_model.lua
new file mode 100644
index 0000000000..61b9213929
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/convert_model.lua
@@ -0,0 +1,108 @@
+-- Copyright (c) Facebook, Inc. and its affiliates.
+--
+-- This source code is licensed under the MIT license found in the
+-- LICENSE file in the root directory of this source tree.
+--
+-- Usage: convert_model.lua <model_epoch1.th7>
+require 'torch'
+local fairseq = require 'fairseq'
+
+model = torch.load(arg[1])
+
+function find_weight_norm(container, module)
+  for _, wn in ipairs(container:listModules()) do
+    if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then
+      return wn
+    end
+  end
+end
+
+function push_state(dict, key, module)
+  if torch.type(module) == 'nn.Linear' then
+    local wn = find_weight_norm(model.module, module)
+    assert(wn)
+    dict[key .. '.weight_v'] = wn.v:float()
+    dict[key .. '.weight_g'] = wn.g:float()
+  elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then
+    local wn = find_weight_norm(model.module, module)
+    assert(wn)
+    local v = wn.v:float():view(wn.viewOut):transpose(2, 3)
+    dict[key .. '.weight_v'] = v
+    dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1)
+  else
+    dict[key .. '.weight'] = module.weight:float()
+  end
+  if module.bias then
+    dict[key .. '.bias'] = module.bias:float()
+  end
+end
+
+encoder_dict = {}
+decoder_dict = {}
+combined_dict = {}
+
+function encoder_state(encoder)
+  luts = encoder:findModules('nn.LookupTable')
+  push_state(encoder_dict, 'embed_tokens', luts[1])
+  push_state(encoder_dict, 'embed_positions', luts[2])
+
+  fcs = encoder:findModules('nn.Linear')
+  assert(#fcs >= 2)
+  local nInputPlane = fcs[1].weight:size(1)
+  push_state(encoder_dict, 'fc1', table.remove(fcs, 1))
+  push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs))
+
+  for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do
+    push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module)
+    if nInputPlane ~= module.weight:size(3) / 2 then
+      push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
+    end
+    nInputPlane = module.weight:size(3) / 2
+  end
+  assert(#fcs == 0)
+end
+
+function decoder_state(decoder)
+  luts = decoder:findModules('nn.LookupTable')
+  push_state(decoder_dict, 'embed_tokens', luts[1])
+  push_state(decoder_dict, 'embed_positions', luts[2])
+
+  fcs = decoder:findModules('nn.Linear')
+  local nInputPlane = fcs[1].weight:size(1)
+  push_state(decoder_dict, 'fc1', table.remove(fcs, 1))
+  push_state(decoder_dict, 'fc2', fcs[#fcs - 1])
+  push_state(decoder_dict, 'fc3', fcs[#fcs])
+
+  table.remove(fcs, #fcs)
+  table.remove(fcs, #fcs)
+
+  for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do
+    if nInputPlane ~= module.weight:size(3) / 2 then
+      push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
+    end
+    nInputPlane = module.weight:size(3) / 2
+
+    local prefix = 'attention.' .. tostring(i - 1)
+    push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1))
+    push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1))
+    push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module)
+  end
+  assert(#fcs == 0)
+end
+
+
+_encoder = model.module.modules[2]
+_decoder = model.module.modules[3]
+
+encoder_state(_encoder)
+decoder_state(_decoder)
+
+for k, v in pairs(encoder_dict) do
+  combined_dict['encoder.' .. k] = v
+end
+for k, v in pairs(decoder_dict) do
+  combined_dict['decoder.' .. k] = v
+end
+
+
+torch.save('state_dict.t7', combined_dict)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
new file mode 100644
index 0000000000..58d85af85e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Count the number of documents and average number of lines and tokens per
+document in a large file. Documents should be separated by a single empty line.
+"""
+
+import argparse
+import gzip
+import sys
+
+import numpy as np
+
+
+def _count_docs(lines, progress=None):
+    """Collect per-document line and token counts.
+
+    Args:
+        lines: iterable of text lines; a whitespace-only line ends the
+            current document.
+        progress: optional text stream that receives progress marks.
+
+    Returns:
+        A ``(num_lines, num_toks)`` pair of lists with one entry per document.
+    """
+    num_lines = []
+    num_toks = []
+    num_lines_in_doc = 0
+    num_toks_in_doc = 0
+    for i, line in enumerate(lines):
+        tokens = line.split()
+        if not tokens:  # empty line indicates new document
+            num_lines.append(num_lines_in_doc)
+            num_toks.append(num_toks_in_doc)
+            num_lines_in_doc = 0
+            num_toks_in_doc = 0
+        else:
+            num_lines_in_doc += 1
+            num_toks_in_doc += len(tokens)
+        if progress is not None:
+            if i % 1000000 == 0:
+                print(i, file=progress, end="", flush=True)
+            elif i % 100000 == 0:
+                print(".", file=progress, end="", flush=True)
+    # The final document normally has no trailing separator; count it too.
+    if num_lines_in_doc > 0:
+        num_lines.append(num_lines_in_doc)
+        num_toks.append(num_toks_in_doc)
+    return num_lines, num_toks
+
+
+def main():
+    """Parse the command line and print the statistics to stdout."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input")
+    parser.add_argument("--gzip", action="store_true")
+    args = parser.parse_args()
+
+    # Text mode in both cases so lines are str (gzip.open defaults to bytes).
+    opener = gzip.open if args.gzip else open
+    with opener(args.input, "rt", encoding="utf-8") as h:
+        num_lines, num_toks = _count_docs(h, progress=sys.stderr)
+    print(file=sys.stderr, flush=True)
+
+    # np.mean([]) warns and yields nan, so report 0.0 for an empty input.
+    mean_lines = np.mean(num_lines) if num_lines else 0.0
+    mean_toks = np.mean(num_toks) if num_toks else 0.0
+    print("found {} docs".format(len(num_lines)))
+    print("average num lines per doc: {}".format(mean_lines))
+    print("average num toks per doc: {}".format(mean_toks))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
new file mode 100644
index 0000000000..a414095d03
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from fairseq.data import Dictionary, data_utils, indexed_dataset
+
+
+def get_parser():
+    """Build the argument parser for this script."""
+    cli = argparse.ArgumentParser(
+        description="writes text from binarized file to stdout"
+    )
+    # fmt: off
+    cli.add_argument('--dataset-impl', help='dataset implementation',
+                     choices=indexed_dataset.get_available_dataset_impl())
+    cli.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None)
+    cli.add_argument('--input', metavar='FP', required=True, help='binarized file to read')
+    # fmt: on
+    return cli
+
+
+def main():
+    """Load the dataset named by --input and print one example per line."""
+    args = get_parser().parse_args()
+
+    dictionary = None
+    if args.dict is not None:
+        dictionary = Dictionary.load(args.dict)
+
+    dataset = data_utils.load_indexed_dataset(
+        args.input,
+        dictionary,
+        dataset_impl=args.dataset_impl,
+        default="lazy",
+    )
+
+    for example in dataset:
+        if dictionary is not None:
+            text = dictionary.string(example)
+        else:
+            # No dictionary given: print the raw integer ids instead.
+            text = " ".join(str(int(tok)) for tok in example)
+        print(text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
new file mode 100644
index 0000000000..6cd063d21f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import re
+import shutil
+import sys
+
+
+pt_regexp = re.compile(r"checkpoint(\d+|_\d+_\d+|_[a-z]+)\.pt")
+pt_regexp_epoch_based = re.compile(r"checkpoint(\d+)\.pt")
+pt_regexp_update_based = re.compile(r"checkpoint_\d+_(\d+)\.pt")
+
+
+def parse_checkpoints(files):
+    """Return ``(number, filename)`` pairs for numbered checkpoint files.
+
+    The number is the group captured by the epoch-based pattern or, failing
+    that, by the update-based pattern; names matching neither are ignored.
+    """
+    entries = []
+    for f in files:
+        # Try the epoch-based pattern first, then the update-based one.
+        m = pt_regexp_epoch_based.fullmatch(f) or pt_regexp_update_based.fullmatch(f)
+        if m is not None:
+            entries.append((int(m.group(1)), m.group(0)))
+    return entries
+
+
+def last_n_checkpoints(files, n):
+    """Return the names of the ``n`` highest-numbered checkpoints, highest first."""
+    entries = parse_checkpoints(files)
+    return [x[1] for x in sorted(entries, reverse=True)[:n]]
+
+
+def every_n_checkpoints(files, n):
+    """Return every ``n``-th checkpoint name, counting back from the highest number."""
+    entries = parse_checkpoints(files)
+    return [x[1] for x in sorted(sorted(entries)[::-n])]
+
+
+def _dereference_symlink(path):
+    """Replace the symlink at ``path`` with a regular copy of its target.
+
+    The copy is written next to ``path`` first and then renamed over the link,
+    so a failed copy (dangling link, full disk, ...) leaves the link in place.
+    """
+    realpath = os.path.realpath(path)
+    tmp_path = path + ".tmp"
+    try:
+        shutil.copyfile(realpath, tmp_path)
+        os.replace(tmp_path, path)
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+
+
+def main():
+    """Ask for confirmation, then delete or dereference the selected checkpoints."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Recursively delete checkpoint files from `root_dir`, "
+            "but preserve checkpoint_best.pt and checkpoint_last.pt"
+        )
+    )
+    parser.add_argument("root_dirs", nargs="*")
+    parser.add_argument(
+        "--save-last", type=int, default=0, help="number of last checkpoints to save"
+    )
+    parser.add_argument(
+        "--save-every", type=int, default=0, help="interval of checkpoints to save"
+    )
+    parser.add_argument(
+        "--preserve-test",
+        action="store_true",
+        help="preserve checkpoints in dirs that start with test_ prefix (default: delete them)",
+    )
+    parser.add_argument(
+        "--delete-best", action="store_true", help="delete checkpoint_best.pt"
+    )
+    parser.add_argument(
+        "--delete-last", action="store_true", help="delete checkpoint_last.pt"
+    )
+    parser.add_argument(
+        "--no-dereference", action="store_true", help="don't dereference symlinks"
+    )
+    args = parser.parse_args()
+
+    files_to_desymlink = []
+    files_to_preserve = []
+    files_to_delete = []
+    for root_dir in args.root_dirs:
+        for root, _subdirs, files in os.walk(root_dir):
+            to_save = set()
+            if args.save_last > 0:
+                to_save.update(last_n_checkpoints(files, args.save_last))
+            if args.save_every > 0:
+                to_save.update(every_n_checkpoints(files, args.save_every))
+            # Dirs named test_* keep nothing unless --preserve-test is given.
+            may_preserve = (
+                not os.path.basename(root).startswith("test_") or args.preserve_test
+            )
+            for file in files:
+                if not pt_regexp.fullmatch(file):
+                    continue
+                full_path = os.path.join(root, file)
+                keep = may_preserve and (
+                    (file == "checkpoint_last.pt" and not args.delete_last)
+                    or (file == "checkpoint_best.pt" and not args.delete_best)
+                    or file in to_save
+                )
+                if not keep:
+                    files_to_delete.append(full_path)
+                elif os.path.islink(full_path) and not args.no_dereference:
+                    files_to_desymlink.append(full_path)
+                else:
+                    files_to_preserve.append(full_path)
+
+    if len(files_to_desymlink) == 0 and len(files_to_delete) == 0:
+        print("Nothing to do.")
+        sys.exit(0)
+
+    files_to_desymlink = sorted(files_to_desymlink)
+    files_to_preserve = sorted(files_to_preserve)
+    files_to_delete = sorted(files_to_delete)
+
+    print("Operations to perform (in order):")
+    for file in files_to_desymlink:
+        print(" - preserve (and dereference symlink): " + file)
+    for file in files_to_preserve:
+        print(" - preserve: " + file)
+    for file in files_to_delete:
+        print(" - delete: " + file)
+    while True:
+        try:
+            resp = input("Continue? (Y/N): ")
+        except EOFError:
+            # No answer can be read (stdin closed): abort without changes.
+            sys.exit(1)
+        if resp.strip().lower() == "y":
+            break
+        elif resp.strip().lower() == "n":
+            sys.exit(0)
+
+    print("Executing...")
+    # Dereference first: a link target may itself be scheduled for deletion.
+    for file in files_to_desymlink:
+        print("cp {} {}".format(os.path.realpath(file), file))
+        _dereference_symlink(file)
+    for file in files_to_delete:
+        print("rm " + file)
+        os.remove(file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/sacrebleu.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/sacrebleu.sh
new file mode 100644
index 0000000000..c10bf2b76e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/sacrebleu.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Extract the hypothesis lines (prefix "H-") from GEN, restore their numeric
+# order, detokenize them with sacremoses and score the result with sacrebleu.
+
+if [ $# -ne 4 ]; then
+    echo "usage: $0 TESTSET SRCLANG TGTLANG GEN"
+    exit 1
+fi
+
+TESTSET=$1
+SRCLANG=$2
+TGTLANG=$3
+
+GEN=$4
+
+if ! command -v sacremoses &> /dev/null
+then
+    echo "sacremoses could not be found, please install with: pip install sacremoses"
+    exit 1  # a missing dependency is a failure, not a success
+fi
+
+# Quote GEN so paths containing spaces or glob characters work.
+grep '^H' "$GEN" \
+| sed 's/^H\-//' \
+| sort -n -k 1 \
+| cut -f 3 \
+| sacremoses detokenize \
+> "$GEN.sorted.detok"
+
+sacrebleu --test-set "$TESTSET" --language-pair "${SRCLANG}-${TGTLANG}" < "$GEN.sorted.detok"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
new file mode 100644
index 0000000000..97232c3c84
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Split a large file into shards while respecting document boundaries. Documents
+should be separated by a single empty line.
+"""
+
+import argparse
+import contextlib
+
+
+def _shard_docs(lines, outputs):
+    """Distribute the documents in ``lines`` round-robin over ``outputs``.
+
+    Args:
+        lines: iterable of text lines; a whitespace-only line ends a document.
+        outputs: writable text streams, one per shard.
+
+    Returns:
+        The number of documents written.
+    """
+    doc = []
+    first_doc = [True] * len(outputs)
+    num_docs = 0
+
+    def output_doc():
+        nonlocal num_docs
+        if not doc:
+            # Repeated or trailing separators must not emit empty documents.
+            return
+        i = num_docs % len(outputs)
+        if not first_doc[i]:
+            outputs[i].write("\n")
+        first_doc[i] = False
+        outputs[i].writelines(doc)
+        doc.clear()
+        num_docs += 1
+
+    for line in lines:
+        if line.strip() == "":  # empty line indicates new document
+            output_doc()
+        else:
+            doc.append(line)
+    output_doc()
+    return num_docs
+
+
+def main():
+    """Parse the command line and write the ``<input>.shard<i>`` files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input")
+    parser.add_argument("--num-shards", type=int)
+    args = parser.parse_args()
+
+    # Explicit check instead of assert, which is stripped under ``python -O``.
+    if args.num_shards is None or args.num_shards <= 1:
+        parser.error("--num-shards must be an integer greater than 1")
+
+    with contextlib.ExitStack() as stack:
+        h = stack.enter_context(open(args.input, "r", encoding="utf-8"))
+        outputs = [
+            stack.enter_context(
+                open(args.input + ".shard" + str(i), "w", encoding="utf-8")
+            )
+            for i in range(args.num_shards)
+        ]
+        _shard_docs(h, outputs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
new file mode 100644
index 0000000000..ff15978528
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Split a large file into a train and valid set while respecting document
+boundaries. Documents should be separated by a single empty line.
+"""
+
+import argparse
+import random
+import sys
+
+
+def _iter_docs(lines, split_lines=False, progress=None):
+    """Yield documents (lists of lines) read from ``lines``.
+
+    Args:
+        lines: iterable of text lines; a whitespace-only line ends a document.
+        split_lines: when true, every non-empty line is its own document.
+        progress: optional text stream that receives progress marks.
+    """
+    doc = []
+    for i, line in enumerate(lines):
+        if line.strip() == "":  # empty line indicates new document
+            if doc:
+                yield doc
+                doc = []
+        else:
+            doc.append(line)
+            if split_lines:
+                yield doc
+                doc = []
+        if progress is not None:
+            if i % 1000000 == 0:
+                print(i, file=progress, end="", flush=True)
+            elif i % 100000 == 0:
+                print(".", file=progress, end="", flush=True)
+    if doc:
+        yield doc
+
+
+def _reservoir_split(docs, k):
+    """Split ``docs`` into a random sample of up to ``k`` items and the rest.
+
+    Returns:
+        A ``(sample, remainder)`` pair; ``sample`` holds at most ``k`` items.
+    """
+    sample = []
+    remainder = []
+    for num_seen, doc in enumerate(docs):
+        if len(sample) < k:
+            sample.append(doc)
+            continue
+        # Reservoir step: the new item takes slot j when j < k, and the
+        # item that loses its place (or the new item) goes to the remainder.
+        j = random.randrange(num_seen + 1)
+        if j < k:
+            remainder.append(sample[j])
+            sample[j] = doc
+        else:
+            remainder.append(doc)
+    return sample, remainder
+
+
+def _write_docs(path, docs, separate):
+    """Write ``docs`` to ``path``, with a blank line between them if ``separate``."""
+    with open(path, "w", encoding="utf-8") as out:
+        for index, doc in enumerate(docs):
+            if separate and index > 0:
+                out.write("\n")
+            out.writelines(doc)
+
+
+def main():
+    """Parse the command line, split the input and write both output files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input")
+    parser.add_argument("sample_output", help="train output file")
+    parser.add_argument("remainder_output", help="valid output file")
+    parser.add_argument("-k", type=int, help="number of docs written to sample_output")
+    parser.add_argument(
+        "--lines", action="store_true", help="split lines instead of docs"
+    )
+    args = parser.parse_args()
+
+    # Explicit checks instead of asserts, which are stripped under ``python -O``.
+    if args.k is None or args.k < 0:
+        parser.error("-k must be given as a non-negative integer")
+
+    with open(args.input, "r", encoding="utf-8") as h:
+        docs = _iter_docs(h, split_lines=args.lines, progress=sys.stderr)
+        sample, remainder = _reservoir_split(docs, args.k)
+    print(file=sys.stderr, flush=True)
+
+    if len(sample) != args.k:
+        sys.exit(
+            "input has only {} documents, fewer than -k {}".format(len(sample), args.k)
+        )
+
+    _write_docs(args.sample_output, sample, separate=not args.lines)
+    _write_docs(args.remainder_output, remainder, separate=not args.lines)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
new file mode 100644
index 0000000000..7d7b68b240
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+
+import sentencepiece as spm
+
+
+def _tok2int(tok):
+    """Convert an id token to int, mapping the ``<<unk>>`` marker to 0."""
+    # remap reference-side <unk> (represented as <<unk>>) to 0
+    if tok == "<<unk>>":
+        return 0
+    return int(tok)
+
+
+def main():
+    """Decode each line of --input with the given model and print the text."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", required=True, help="sentencepiece model to use for decoding"
+    )
+    parser.add_argument("--input", required=True, help="input file to decode")
+    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
+    args = parser.parse_args()
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.input_format == "piece":
+
+        def decode_line(line):
+            pieces = line.rstrip().split()
+            return "".join(sp.DecodePieces(pieces))
+
+    elif args.input_format == "id":
+
+        def decode_line(line):
+            ids = [_tok2int(tok) for tok in line.rstrip().split()]
+            return "".join(sp.DecodeIds(ids))
+
+    else:
+        raise NotImplementedError
+
+    with open(args.input, "r", encoding="utf-8") as h:
+        for line in h:
+            print(decode_line(line))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
new file mode 100644
index 0000000000..f91e0bb728
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+    """Encode parallel input files line by line, dropping empty or filtered rows."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", required=True, help="sentencepiece model to use for encoding"
+    )
+    parser.add_argument(
+        "--inputs", nargs="+", default=["-"], help="input files to filter/encode"
+    )
+    parser.add_argument(
+        "--outputs", nargs="+", default=["-"], help="path to save encoded outputs"
+    )
+    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+    parser.add_argument(
+        "--min-len",
+        type=int,
+        metavar="N",
+        help="filter sentence pairs with fewer than N tokens",
+    )
+    parser.add_argument(
+        "--max-len",
+        type=int,
+        metavar="N",
+        help="filter sentence pairs with more than N tokens",
+    )
+    args = parser.parse_args()
+
+    assert len(args.inputs) == len(
+        args.outputs
+    ), "number of input and output paths should match"
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.output_format == "piece":
+        encode = sp.EncodeAsPieces
+    elif args.output_format == "id":
+
+        def encode(text):
+            return [str(tok_id) for tok_id in sp.EncodeAsIds(text)]
+
+    else:
+        raise NotImplementedError
+
+    def valid(tokens):
+        # A bound that was not given on the command line is not enforced.
+        if args.min_len is not None and len(tokens) < args.min_len:
+            return False
+        if args.max_len is not None and len(tokens) > args.max_len:
+            return False
+        return True
+
+    stats = {
+        "num_empty": 0,
+        "num_filtered": 0,
+    }
+
+    def encode_line(line):
+        """Return the encoded tokens, or None for an empty or filtered line."""
+        line = line.strip()
+        if len(line) == 0:
+            stats["num_empty"] += 1
+            return None
+        tokens = encode(line)
+        if not valid(tokens):
+            stats["num_filtered"] += 1
+            return None
+        return tokens
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            sys.stdin
+            if path == "-"
+            else stack.enter_context(open(path, "r", encoding="utf-8"))
+            for path in args.inputs
+        ]
+        outputs = [
+            sys.stdout
+            if path == "-"
+            else stack.enter_context(open(path, "w", encoding="utf-8"))
+            for path in args.outputs
+        ]
+
+        for i, lines in enumerate(zip(*inputs), start=1):
+            enc_lines = [encode_line(line) for line in lines]
+            if all(enc_line is not None for enc_line in enc_lines):
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    print(" ".join(enc_line), file=output_h)
+            if i % 10000 == 0:
+                print("processed {} lines".format(i), file=sys.stderr)
+
+        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
new file mode 100644
index 0000000000..9db668fd41
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+
+import sentencepiece as spm
+
+
+if __name__ == "__main__":
+    # The command-line arguments are handed to Train as one space-joined string.
+    trainer_flags = " ".join(sys.argv[1:])
+    spm.SentencePieceTrainer.Train(trainer_flags)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/test_fsdp.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/test_fsdp.sh
new file mode 100644
index 0000000000..1f428a035e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/test_fsdp.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Train a tiny language model with the fully_sharded backend (GPUs 0-3 visible),
+# then start a second run (GPUs 0-1 visible) against the same save dir.
+# Set FSDP_DATA_BIN to use a dataset other than the built-in default path.
+DATA_BIN="${FSDP_DATA_BIN:-/private/home/sshleifer/data-bin/stories_mmap}"
+rm -rf fsdp_dummy
+mkdir -p fsdp_dummy
+CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train "$DATA_BIN" \
+    --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+    --cpu-offload --checkpoint-activations \
+    --task language_modeling --tokens-per-sample 256 --batch-size 8 \
+    --arch transformer_lm_gpt2_tiny \
+    --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+    --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+    --max-update 5 --log-format json --log-interval 1 \
+    --save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \
+    --restore-file x.pt "$@"
+
+# Now we try to load the checkpoint
+CUDA_VISIBLE_DEVICES=0,1 fairseq-train "$DATA_BIN" \
+    --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
+    --cpu-offload --checkpoint-activations \
+    --task language_modeling --tokens-per-sample 256 --batch-size 8 \
+    --arch transformer_lm_gpt2_tiny \
+    --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
+    --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
+    --max-update 2 --log-format json --log-interval 1 \
+    --save-interval-updates 2 --save-dir fsdp_dummy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.cfg b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.cfg
new file mode 100644
index 0000000000..3fa679ddf1
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.cfg
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 127
+extend-ignore = E203, W503
+extend-exclude = fairseq/model_parallel/megatron
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
new file mode 100644
index 0000000000..dae06080c5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import subprocess
+import sys
+
+from setuptools import Extension, find_packages, setup
+from torch.utils import cpp_extension
+
+if sys.version_info < (3, 6):
+    sys.exit("Sorry, Python >= 3.6 is required for fairseq.")
+
+
+def write_version_py():
+    """Write fairseq/version.py from fairseq/version.txt and return the version."""
+    with open(os.path.join("fairseq", "version.txt")) as f:
+        version = f.read().strip()
+
+    # write version info to fairseq/version.py
+    with open(os.path.join("fairseq", "version.py"), "w") as f:
+        f.write('__version__ = "{}"\n'.format(version))
+    return version
+
+
+version = write_version_py()
+
+
+# Decode explicitly so the build does not depend on the locale's encoding.
+with open("README.md", encoding="utf-8") as f:
+    readme = f.read()
+
+
+if sys.platform == "darwin":
+    extra_compile_args = ["-stdlib=libc++", "-O3"]
+else:
+    extra_compile_args = ["-std=c++11", "-O3"]
+
+
+class NumpyExtension(Extension):
+    """Source: https://stackoverflow.com/a/54128391"""
+
+    def __init__(self, *args, **kwargs):
+        self.__include_dirs = []
+        super().__init__(*args, **kwargs)
+
+    @property
+    def include_dirs(self):
+        # numpy is imported only when the include dirs are actually read.
+        import numpy
+
+        return self.__include_dirs + [numpy.get_include()]
+
+    @include_dirs.setter
+    def include_dirs(self, dirs):
+        self.__include_dirs = dirs
+
+
+extensions = [
+    Extension(
+        "fairseq.libbleu",
+        sources=[
+            "fairseq/clib/libbleu/libbleu.cpp",
+            "fairseq/clib/libbleu/module.cpp",
+        ],
+        extra_compile_args=extra_compile_args,
+    ),
+    NumpyExtension(
+        "fairseq.data.data_utils_fast",
+        sources=["fairseq/data/data_utils_fast.pyx"],
+        language="c++",
+        extra_compile_args=extra_compile_args,
+    ),
+    NumpyExtension(
+        "fairseq.data.token_block_utils_fast",
+        sources=["fairseq/data/token_block_utils_fast.pyx"],
+        language="c++",
+        extra_compile_args=extra_compile_args,
+    ),
+]
+
+
+extensions.extend(
+    [
+        cpp_extension.CppExtension(
+            "fairseq.libbase",
+            sources=[
+                "fairseq/clib/libbase/balanced_assignment.cpp",
+            ],
+        ),
+        cpp_extension.CppExtension(
+            "fairseq.libnat",
+            sources=[
+                "fairseq/clib/libnat/edit_dist.cpp",
+            ],
+        ),
+        cpp_extension.CppExtension(
+            "alignment_train_cpu_binding",
+            sources=[
+                "examples/operators/alignment_train_cpu.cpp",
+            ],
+        ),
+    ]
+)
+if "CUDA_HOME" in os.environ:
+    extensions.extend(
+        [
+            cpp_extension.CppExtension(
+                "fairseq.libnat_cuda",
+                sources=[
+                    "fairseq/clib/libnat_cuda/edit_dist.cu",
+                    "fairseq/clib/libnat_cuda/binding.cpp",
+                ],
+            ),
+            cpp_extension.CppExtension(
+                "fairseq.ngram_repeat_block_cuda",
+                sources=[
+                    "fairseq/clib/cuda/ngram_repeat_block_cuda.cpp",
+                    "fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu",
+                ],
+            ),
+            cpp_extension.CppExtension(
+                "alignment_train_cuda_binding",
+                sources=[
+                    "examples/operators/alignment_train_kernel.cu",
+                    "examples/operators/alignment_train_cuda.cpp",
+                ],
+            ),
+        ]
+    )
+
+cmdclass = {"build_ext": cpp_extension.BuildExtension}
+
+if "READTHEDOCS" in os.environ:
+    # don't build extensions when generating docs
+    extensions = []
+    if "build_ext" in cmdclass:
+        del cmdclass["build_ext"]
+
+    # use CPU build of PyTorch
+    dependency_links = [
+        "https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl"
+    ]
+else:
+    dependency_links = []
+
+
+if "clean" in sys.argv[1:]:
+    # Source: https://bit.ly/2NLVsgE
+    print("deleting Cython files...")
+
+    # With shell=True the command is passed as one string, not as a list.
+    subprocess.run(
+        "rm -f fairseq/*.so fairseq/**/*.so fairseq/*.pyd fairseq/**/*.pyd",
+        shell=True,
+    )
+
+
+extra_packages = []
+if os.path.exists(os.path.join("fairseq", "model_parallel", "megatron", "mpu")):
+    extra_packages.append("fairseq.model_parallel.megatron.mpu")
+
+
+def do_setup(package_data):
+    """Call setuptools.setup for fairseq with the given package_data mapping."""
+    setup(
+        name="fairseq",
+        version=version,
+        description="Facebook AI Research Sequence-to-Sequence Toolkit",
+        url="https://github.com/pytorch/fairseq",
+        classifiers=[
+            "Intended Audience :: Science/Research",
+            "License :: OSI Approved :: MIT License",
+            "Programming Language :: Python :: 3.6",
+            "Programming Language :: Python :: 3.7",
+            "Programming Language :: Python :: 3.8",
+            "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        ],
+        long_description=readme,
+        long_description_content_type="text/markdown",
+        install_requires=[
+            "cffi",
+            "cython",
+            "hydra-core>=1.0.7,<1.1",
+            "omegaconf<2.1",
+            "numpy>=1.21.3",
+            "regex",
+            "sacrebleu>=1.4.12",
+            "torch>=1.13",
+            "tqdm",
+            "bitarray",
+            "torchaudio>=0.8.0",
+            "scikit-learn",
+            "packaging",
+        ],
+        extras_require={
+            "dev": ["flake8", "pytest", "black==22.3.0"],
+            "docs": ["sphinx", "sphinx-argparse"],
+        },
+        dependency_links=dependency_links,
+        packages=find_packages(
+            exclude=[
+                "examples",
+                "examples.*",
+                "scripts",
+                "scripts.*",
+                "tests",
+                "tests.*",
+            ]
+        )
+        + extra_packages,
+        package_data=package_data,
+        ext_modules=extensions,
+        test_suite="tests",
+        entry_points={
+            "console_scripts": [
+                "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main",
+                "fairseq-generate = fairseq_cli.generate:cli_main",
+                "fairseq-hydra-train = fairseq_cli.hydra_train:cli_main",
+                "fairseq-interactive = fairseq_cli.interactive:cli_main",
+                "fairseq-preprocess = fairseq_cli.preprocess:cli_main",
+                "fairseq-score = fairseq_cli.score:cli_main",
+                "fairseq-train = fairseq_cli.train:cli_main",
+                "fairseq-validate = fairseq_cli.validate:cli_main",
+            ],
+        },
+        cmdclass=cmdclass,
+        zip_safe=False,
+    )
+
+
+def get_files(path, relative_to="fairseq"):
+    """List all non-.pyc files under ``path`` as paths relative to ``relative_to``."""
+    all_files = []
+    for root, _dirs, files in os.walk(path, followlinks=True):
+        root = os.path.relpath(root, relative_to)
+        for file in files:
+            if file.endswith(".pyc"):
+                continue
+            all_files.append(os.path.join(root, file))
+    return all_files
+
+
+if __name__ == "__main__":
+    try:
+        # symlink examples into fairseq package so package_data accepts them
+        fairseq_examples = os.path.join("fairseq", "examples")
+        if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples):
+            os.symlink(os.path.join("..", "examples"), fairseq_examples)
+
+        package_data = {
+            "fairseq": (
+                get_files(fairseq_examples)
+                + get_files(os.path.join("fairseq", "config"))
+            )
+        }
+        do_setup(package_data)
+    finally:
+        if "build_ext" not in sys.argv[1:] and os.path.islink(fairseq_examples):
+            os.unlink(fairseq_examples)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
new file mode 100644
index 0000000000..321de3d9b5
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead.
+"""
+
+from fairseq_cli.train import cli_main
+
+
+if __name__ == "__main__":
+    cli_main()
-- 
Gitee


From 190736c917e60d5abbde8c258a449749281c3cc1 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Fri, 9 Jun 2023 16:58:11 +0800
Subject: [PATCH 02/12] license remove

---
 .../docs/Makefile                             | 20 ----
 .../docs/conf.py                              | 98 -------------------
 .../docs/docutils.conf                        |  2 -
 .../docs/make.bat                             | 36 -------
 .../fairseq/__init__.py                       |  4 -
 .../fairseq/benchmark/__init__.py             |  4 -
 .../benchmark_multihead_attention.py          |  4 -
 .../fairseq/benchmark/dummy_lm.py             |  4 -
 .../fairseq/benchmark/dummy_masked_lm.py      |  4 -
 .../fairseq/benchmark/dummy_model.py          |  4 -
 .../fairseq/benchmark/dummy_mt.py             |  4 -
 .../fairseq/binarizer.py                      |  4 -
 .../fairseq/checkpoint_utils.py               |  4 -
 .../fairseq/config/__init__.py                |  4 -
 .../fairseq/criterions/__init__.py            |  4 -
 .../fairseq/criterions/adaptive_loss.py       |  4 -
 .../fairseq/criterions/composite_loss.py      |  4 -
 .../fairseq/criterions/cross_entropy.py       |  4 -
 .../fairseq/criterions/fairseq_criterion.py   |  4 -
 .../fairseq/criterions/hubert_criterion.py    |  4 -
 .../label_smoothed_cross_entropy.py           |  4 -
 ...moothed_cross_entropy_latency_augmented.py |  4 -
 ...l_smoothed_cross_entropy_with_alignment.py |  4 -
 .../label_smoothed_cross_entropy_with_ctc.py  |  4 -
 ...label_smoothed_cross_entropy_with_rdrop.py |  4 -
 .../fairseq/criterions/legacy_masked_lm.py    |  4 -
 .../fairseq/criterions/masked_lm.py           |  4 -
 .../fairseq/criterions/model_criterion.py     |  4 -
 .../fairseq/criterions/nat_loss.py            |  4 -
 .../fairseq/criterions/sentence_prediction.py |  4 -
 .../sentence_prediction_adapters.py           |  4 -
 .../fairseq/criterions/sentence_ranking.py    |  4 -
 .../criterions/speech_dlm_criterion.py        |  4 -
 .../criterions/speech_to_speech_criterion.py  |  4 -
 .../criterions/speech_ulm_criterion.py        |  4 -
 .../fairseq/criterions/wav2vec_criterion.py   |  4 -
 .../fairseq/data/__init__.py                  |  4 -
 .../fairseq/data/add_class_target_dataset.py  |  4 -
 .../fairseq/data/add_target_dataset.py        |  4 -
 .../fairseq/data/append_token_dataset.py      |  4 -
 .../fairseq/data/audio/audio_utils.py         |  4 -
 .../fairseq/data/audio/data_cfg.py            |  4 -
 .../fairseq/data/audio/hubert_dataset.py      |  4 -
 .../fairseq/data/audio/raw_audio_dataset.py   |  4 -
 .../data/audio/speech_to_speech_dataset.py    |  4 -
 .../data/audio/speech_to_text_dataset.py      |  4 -
 .../audio/speech_to_text_joint_dataset.py     |  4 -
 .../fairseq/data/backtranslation_dataset.py   |  4 -
 .../fairseq/data/base_wrapper_dataset.py      |  4 -
 .../fairseq/data/bucket_pad_length_dataset.py |  4 -
 .../fairseq/data/codedataset.py               |  4 -
 .../fairseq/data/colorize_dataset.py          |  4 -
 .../fairseq/data/concat_dataset.py            |  4 -
 .../fairseq/data/concat_sentences_dataset.py  |  4 -
 .../fairseq/data/data_utils.py                |  4 -
 .../fairseq/data/data_utils_fast.pyx          |  4 -
 .../fairseq/data/denoising_dataset.py         |  4 -
 .../fairseq/data/dictionary.py                |  4 -
 .../fairseq/data/encoders/__init__.py         |  4 -
 .../fairseq/data/encoders/byte_bpe.py         |  4 -
 .../fairseq/data/encoders/byte_utils.py       |  4 -
 .../fairseq/data/encoders/bytes.py            |  4 -
 .../fairseq/data/encoders/characters.py       |  4 -
 .../fairseq/data/encoders/fastbpe.py          |  4 -
 .../fairseq/data/encoders/gpt2_bpe.py         |  4 -
 .../fairseq/data/encoders/hf_bert_bpe.py      |  4 -
 .../fairseq/data/encoders/hf_byte_bpe.py      |  4 -
 .../fairseq/data/encoders/moses_tokenizer.py  |  4 -
 .../fairseq/data/encoders/nltk_tokenizer.py   |  4 -
 .../data/encoders/sentencepiece_bpe.py        |  4 -
 .../fairseq/data/encoders/space_tokenizer.py  |  4 -
 .../fairseq/data/encoders/subword_nmt_bpe.py  |  4 -
 .../fairseq/data/encoders/utils.py            |  4 -
 .../fairseq/data/fairseq_dataset.py           |  4 -
 .../fairseq/data/fasta_dataset.py             |  4 -
 .../fairseq/data/huffman/__init__.py          |  4 -
 .../fairseq/data/huffman/huffman_coder.py     |  4 -
 .../huffman/huffman_mmap_indexed_dataset.py   |  4 -
 .../fairseq/data/id_dataset.py                |  4 -
 .../fairseq/data/indexed_dataset.py           |  4 -
 .../fairseq/data/iterators.py                 |  4 -
 .../fairseq/data/language_pair_dataset.py     |  4 -
 .../fairseq/data/legacy/__init__.py           |  4 -
 .../fairseq/data/legacy/block_pair_dataset.py |  4 -
 .../fairseq/data/legacy/masked_lm_dataset.py  |  4 -
 .../data/legacy/masked_lm_dictionary.py       |  4 -
 .../fairseq/data/list_dataset.py              |  4 -
 .../fairseq/data/lm_context_window_dataset.py |  4 -
 .../fairseq/data/lru_cache_dataset.py         |  4 -
 .../fairseq/data/mask_tokens_dataset.py       |  4 -
 .../fairseq/data/monolingual_dataset.py       |  4 -
 .../fairseq/data/multi_corpus_dataset.py      |  4 -
 .../data/multi_corpus_sampled_dataset.py      |  4 -
 .../fairseq/data/multilingual/__init__.py     |  4 -
 .../multilingual/multilingual_data_manager.py |  4 -
 .../multilingual/sampled_multi_dataset.py     |  4 -
 .../sampled_multi_epoch_dataset.py            |  4 -
 .../data/multilingual/sampling_method.py      |  4 -
 .../fairseq/data/nested_dictionary_dataset.py |  4 -
 .../fairseq/data/noising.py                   |  4 -
 .../fairseq/data/num_samples_dataset.py       |  4 -
 .../fairseq/data/numel_dataset.py             |  4 -
 .../fairseq/data/offset_tokens_dataset.py     |  4 -
 .../fairseq/data/pad_dataset.py               |  4 -
 .../fairseq/data/padding_mask_dataset.py      |  4 -
 .../fairseq/data/plasma_utils.py              |  4 -
 .../fairseq/data/prepend_dataset.py           |  4 -
 .../fairseq/data/prepend_token_dataset.py     |  4 -
 .../fairseq/data/raw_label_dataset.py         |  4 -
 .../fairseq/data/replace_dataset.py           |  4 -
 .../fairseq/data/resampling_dataset.py        |  4 -
 .../fairseq/data/roll_dataset.py              |  4 -
 .../fairseq/data/round_robin_zip_datasets.py  |  4 -
 .../fairseq/data/shorten_dataset.py           |  4 -
 .../fairseq/data/sort_dataset.py              |  4 -
 .../fairseq/data/span_mask_tokens_dataset.py  |  4 -
 .../fairseq/data/speech_dlm_dataset.py        |  4 -
 .../fairseq/data/strip_token_dataset.py       |  4 -
 .../fairseq/data/subsample_dataset.py         |  4 -
 .../fairseq/data/text_compressor.py           |  4 -
 .../fairseq/data/token_block_dataset.py       |  4 -
 .../fairseq/data/token_block_utils_fast.pyx   |  4 -
 .../transform_eos_concat_langpair_dataset.py  |  4 -
 .../fairseq/data/transform_eos_dataset.py     |  4 -
 .../data/transform_eos_lang_pair_dataset.py   |  4 -
 .../fairseq/dataclass/__init__.py             |  4 -
 .../fairseq/dataclass/configs.py              |  4 -
 .../fairseq/dataclass/constants.py            |  4 -
 .../fairseq/dataclass/initialize.py           |  4 -
 .../fairseq/dataclass/utils.py                |  4 -
 .../fairseq/distributed/__init__.py           |  4 -
 .../distributed_timeout_wrapper.py            |  4 -
 .../fully_sharded_data_parallel.py            |  4 -
 .../legacy_distributed_data_parallel.py       |  4 -
 .../distributed/module_proxy_wrapper.py       |  4 -
 .../tpu_distributed_data_parallel.py          |  4 -
 .../fairseq/distributed/utils.py              |  4 -
 .../fairseq/file_chunker_utils.py             |  4 -
 .../fairseq/file_io.py                        |  4 -
 .../fairseq/file_utils.py                     |  4 -
 .../fairseq/hub_utils.py                      |  4 -
 .../fairseq/incremental_decoding_utils.py     |  4 -
 .../fairseq/iterative_refinement_generator.py |  4 -
 .../fairseq/logging/meters.py                 |  4 -
 .../fairseq/logging/metrics.py                |  4 -
 .../fairseq/logging/progress_bar.py           |  4 -
 .../fairseq/model_parallel/__init__.py        |  4 -
 .../model_parallel/criterions/__init__.py     |  4 -
 .../vocab_parallel_cross_entropy.py           |  4 -
 .../model_parallel/megatron_trainer.py        |  4 -
 .../fairseq/model_parallel/models/__init__.py |  4 -
 .../pipeline_parallel_transformer/__init__.py |  4 -
 .../pipeline_parallel_transformer/layers.py   |  4 -
 .../pipeline_parallel_transformer/model.py    |  4 -
 .../model_parallel/models/transformer.py      |  4 -
 .../model_parallel/models/transformer_lm.py   |  4 -
 .../model_parallel/modules/__init__.py        |  4 -
 .../modules/multihead_attention.py            |  4 -
 .../modules/transformer_layer.py              |  4 -
 .../fairseq/models/__init__.py                |  4 -
 .../fairseq/models/bart/__init__.py           |  4 -
 .../fairseq/models/bart/hub_interface.py      |  4 -
 .../fairseq/models/bart/model.py              |  4 -
 .../fairseq/models/composite_encoder.py       |  4 -
 .../models/distributed_fairseq_model.py       |  4 -
 .../fairseq/models/ema/__init__.py            |  4 -
 .../fairseq/models/fairseq_decoder.py         |  4 -
 .../fairseq/models/fairseq_encoder.py         |  4 -
 .../models/fairseq_incremental_decoder.py     |  4 -
 .../fairseq/models/fairseq_model.py           |  4 -
 .../fairseq/models/fconv.py                   |  4 -
 .../fairseq/models/fconv_lm.py                |  4 -
 .../fairseq/models/fconv_self_att.py          |  4 -
 .../fairseq/models/hubert/__init__.py         |  4 -
 .../fairseq/models/hubert/hubert.py           |  4 -
 .../fairseq/models/hubert/hubert_asr.py       |  4 -
 .../fairseq/models/huggingface/__init__.py    |  4 -
 .../fairseq/models/huggingface/hf_gpt2.py     |  4 -
 .../fairseq/models/lightconv.py               |  4 -
 .../fairseq/models/lightconv_lm.py            |  4 -
 .../fairseq/models/lstm.py                    |  4 -
 .../fairseq/models/lstm_lm.py                 |  4 -
 .../fairseq/models/masked_lm.py               |  4 -
 .../fairseq/models/model_utils.py             |  4 -
 .../models/multilingual_transformer.py        |  4 -
 .../fairseq/models/nat/__init__.py            |  4 -
 .../fairseq/models/nat/cmlm_transformer.py    |  4 -
 .../fairseq/models/nat/fairseq_nat_model.py   |  4 -
 .../models/nat/insertion_transformer.py       |  4 -
 ...iterative_nonautoregressive_transformer.py |  4 -
 .../models/nat/levenshtein_transformer.py     |  4 -
 .../fairseq/models/nat/levenshtein_utils.py   |  4 -
 .../fairseq/models/nat/nat_crf_transformer.py |  4 -
 .../models/nat/nonautoregressive_ensembles.py |  4 -
 .../nat/nonautoregressive_transformer.py      |  4 -
 .../fairseq/models/roberta/__init__.py        |  4 -
 .../fairseq/models/roberta/alignment_utils.py |  4 -
 .../fairseq/models/roberta/hub_interface.py   |  4 -
 .../fairseq/models/roberta/model.py           |  4 -
 .../fairseq/models/roberta/model_camembert.py |  4 -
 .../fairseq/models/roberta/model_gottbert.py  |  4 -
 .../fairseq/models/roberta/model_xlmr.py      |  4 -
 .../fairseq/models/speech_dlm/__init__.py     |  4 -
 .../models/speech_dlm/hub_interface.py        |  4 -
 .../speech_dlm/modules/speech_dlm_decoder.py  |  4 -
 .../modules/speech_dlm_decoder_layer.py       |  4 -
 .../speech_dlm/sequence_generator/__init__.py |  4 -
 .../sequence_generator/multichannel_search.py |  4 -
 .../multichannel_sequence_generator.py        |  4 -
 .../fairseq/models/speech_dlm/speech_dlm.py   |  4 -
 .../models/speech_to_speech/__init__.py       |  4 -
 .../speech_to_speech/modules/ctc_decoder.py   |  4 -
 .../modules/stacked_embedding.py              |  4 -
 .../modules/transformer_decoder_aug.py        |  4 -
 .../modules/transformer_encoder.py            |  4 -
 .../models/speech_to_speech/s2s_conformer.py  |  4 -
 .../s2s_conformer_translatotron2.py           |  4 -
 .../speech_to_speech/s2s_conformer_unity.py   |  4 -
 .../speech_to_speech/s2s_transformer.py       |  4 -
 .../fairseq/models/speech_to_text/__init__.py |  4 -
 .../models/speech_to_text/convtransformer.py  |  4 -
 .../models/speech_to_text/hub_interface.py    |  4 -
 .../modules/augmented_memory_attention.py     |  4 -
 .../speech_to_text/modules/convolution.py     |  4 -
 .../speech_to_text/multi_modality_model.py    |  4 -
 .../models/speech_to_text/s2t_conformer.py    |  4 -
 .../models/speech_to_text/s2t_transformer.py  |  4 -
 .../models/speech_to_text/xm_transformer.py   |  4 -
 .../speech_to_text/xm_transformer_unity.py    |  4 -
 .../fairseq/models/text_to_speech/__init__.py |  4 -
 .../models/text_to_speech/fastspeech2.py      |  4 -
 .../models/text_to_speech/hub_interface.py    |  4 -
 .../models/text_to_speech/tacotron2.py        |  4 -
 .../models/text_to_speech/tts_transformer.py  |  4 -
 .../fairseq/models/text_to_speech/vocoder.py  |  4 -
 .../models/transformer/transformer_base.py    |  4 -
 .../models/transformer/transformer_config.py  |  4 -
 .../models/transformer/transformer_decoder.py |  4 -
 .../transformer/transformer_decoder_aug.py    |  4 -
 .../models/transformer/transformer_encoder.py |  4 -
 .../models/transformer/transformer_legacy.py  |  4 -
 .../fairseq/models/transformer_align.py       |  4 -
 .../models/transformer_from_pretrained_xlm.py |  4 -
 .../fairseq/models/transformer_lm.py          |  4 -
 .../fairseq/models/transformer_ulm.py         |  4 -
 .../fairseq/models/wav2vec/__init__.py        |  4 -
 .../fairseq/models/wav2vec/utils.py           |  4 -
 .../fairseq/models/wav2vec/wav2vec.py         |  4 -
 .../fairseq/models/wav2vec/wav2vec2.py        |  4 -
 .../fairseq/models/wav2vec/wav2vec2_asr.py    |  4 -
 .../fairseq/models/wav2vec/wav2vec2_laser.py  |  4 -
 .../fairseq/models/xmod/__init__.py           |  4 -
 .../fairseq/models/xmod/hub_interface.py      |  4 -
 .../fairseq/models/xmod/model.py              |  4 -
 .../models/xmod/transformer_layer_xmod.py     |  4 -
 .../fairseq/modules/__init__.py               |  4 -
 .../fairseq/modules/adaptive_input.py         |  4 -
 .../fairseq/modules/adaptive_softmax.py       |  4 -
 .../fairseq/modules/base_layer.py             |  4 -
 .../fairseq/modules/beamable_mm.py            |  4 -
 .../modules/character_token_embedder.py       |  4 -
 .../fairseq/modules/checkpoint_activations.py |  4 -
 .../fairseq/modules/conformer_layer.py        |  4 -
 .../fairseq/modules/conv_tbc.py               |  4 -
 .../fairseq/modules/cross_entropy.py          |  4 -
 .../downsampled_multihead_attention.py        |  4 -
 .../fairseq/modules/dynamic_convolution.py    |  4 -
 .../fairseq/modules/dynamic_crf_layer.py      |  4 -
 .../modules/dynamicconv_layer/__init__.py     |  4 -
 .../dynamicconv_layer/cuda_function_gen.py    |  4 -
 .../dynamicconv_layer/dynamicconv_layer.py    |  4 -
 .../modules/dynamicconv_layer/setup.py        |  4 -
 .../fairseq/modules/fairseq_dropout.py        |  4 -
 .../fairseq/modules/fp32_batch_norm.py        |  4 -
 .../fairseq/modules/fp32_group_norm.py        |  4 -
 .../fairseq/modules/fp32_instance_norm.py     |  4 -
 .../fairseq/modules/gelu.py                   |  4 -
 .../fairseq/modules/grad_multiply.py          |  4 -
 .../modules/gumbel_vector_quantizer.py        |  4 -
 .../modules/kmeans_vector_quantizer.py        |  4 -
 .../fairseq/modules/layer_drop.py             |  4 -
 .../fairseq/modules/layer_norm.py             |  4 -
 .../modules/learned_positional_embedding.py   |  4 -
 .../modules/lightconv_layer/__init__.py       |  4 -
 .../lightconv_layer/cuda_function_gen.py      |  4 -
 .../lightconv_layer/lightconv_layer.py        |  4 -
 .../fairseq/modules/lightconv_layer/setup.py  |  4 -
 .../modules/lightweight_convolution.py        |  4 -
 .../fairseq/modules/linearized_convolution.py |  4 -
 .../fairseq/modules/location_attention.py     |  4 -
 .../fairseq/modules/lstm_cell_with_zoneout.py |  4 -
 .../fairseq/modules/multihead_attention.py    |  4 -
 .../fairseq/modules/positional_embedding.py   |  4 -
 .../fairseq/modules/positional_encoding.py    |  4 -
 .../fairseq/modules/quant_noise.py            |  4 -
 .../modules/quantization/pq/__init__.py       |  4 -
 .../fairseq/modules/quantization/pq/em.py     |  4 -
 .../quantization/pq/modules/__init__.py       |  4 -
 .../modules/quantization/pq/modules/qconv.py  |  4 -
 .../modules/quantization/pq/modules/qemb.py   |  4 -
 .../quantization/pq/modules/qlinear.py        |  4 -
 .../fairseq/modules/quantization/pq/pq.py     |  4 -
 .../fairseq/modules/quantization/pq/utils.py  |  4 -
 .../quantization/quantization_options.py      |  4 -
 .../modules/quantization/scalar/__init__.py   |  4 -
 .../quantization/scalar/modules/__init__.py   |  4 -
 .../quantization/scalar/modules/qact.py       |  4 -
 .../quantization/scalar/modules/qconv.py      |  4 -
 .../quantization/scalar/modules/qemb.py       |  4 -
 .../quantization/scalar/modules/qlinear.py    |  4 -
 .../modules/quantization/scalar/ops.py        |  4 -
 .../modules/quantization/scalar/utils.py      |  4 -
 .../fairseq/modules/same_pad.py               |  4 -
 .../fairseq/modules/scalar_bias.py            |  4 -
 .../sinusoidal_positional_embedding.py        |  4 -
 .../modules/sparse_multihead_attention.py     |  4 -
 .../sparse_transformer_sentence_encoder.py    |  4 -
 ...arse_transformer_sentence_encoder_layer.py |  4 -
 .../fairseq/modules/transformer_layer.py      |  4 -
 .../fairseq/modules/transformer_layer_aug.py  |  4 -
 .../modules/transformer_sentence_encoder.py   |  4 -
 .../transformer_sentence_encoder_layer.py     |  4 -
 .../fairseq/modules/transpose_last.py         |  4 -
 .../fairseq/modules/unfold.py                 |  4 -
 .../fairseq/modules/vggblock.py               |  4 -
 .../fairseq/nan_detector.py                   |  4 -
 .../fairseq/optim/__init__.py                 |  4 -
 .../fairseq/optim/adadelta.py                 |  4 -
 .../fairseq/optim/adafactor.py                |  4 -
 .../fairseq/optim/adagrad.py                  |  4 -
 .../fairseq/optim/adam.py                     |  4 -
 .../fairseq/optim/adamax.py                   |  4 -
 .../fairseq/optim/amp_optimizer.py            |  4 -
 .../fairseq/optim/bmuf.py                     |  4 -
 .../fairseq/optim/composite.py                |  4 -
 .../fairseq/optim/cpu_adam.py                 |  4 -
 .../fairseq/optim/dynamic_loss_scaler.py      |  4 -
 .../fairseq/optim/fairseq_optimizer.py        |  4 -
 .../fairseq/optim/fp16_optimizer.py           |  4 -
 .../fairseq/optim/fused_adam.py               |  4 -
 .../fairseq/optim/fused_lamb.py               |  4 -
 .../fairseq/optim/lr_scheduler/__init__.py    |  4 -
 .../optim/lr_scheduler/cosine_lr_scheduler.py |  4 -
 .../lr_scheduler/fairseq_lr_scheduler.py      |  4 -
 .../optim/lr_scheduler/fixed_schedule.py      |  4 -
 .../inverse_square_root_schedule.py           |  4 -
 .../optim/lr_scheduler/manual_lr_scheduler.py |  4 -
 .../optim/lr_scheduler/pass_through.py        |  4 -
 .../lr_scheduler/polynomial_decay_schedule.py |  4 -
 .../lr_scheduler/reduce_lr_on_plateau.py      |  4 -
 .../optim/lr_scheduler/step_lr_scheduler.py   |  4 -
 .../lr_scheduler/tri_stage_lr_scheduler.py    |  4 -
 .../lr_scheduler/triangular_lr_scheduler.py   |  4 -
 .../fairseq/optim/nag.py                      |  4 -
 .../fairseq/optim/sgd.py                      |  4 -
 .../fairseq/optim/shard.py                    |  4 -
 .../fairseq/options.py                        |  4 -
 .../fairseq/pdb.py                            |  4 -
 .../fairseq/quantization_utils.py             |  4 -
 .../fairseq/registry.py                       |  4 -
 .../fairseq/scoring/__init__.py               |  4 -
 .../fairseq/scoring/bertscore.py              |  4 -
 .../fairseq/scoring/bleu.py                   |  4 -
 .../fairseq/scoring/chrf.py                   |  4 -
 .../fairseq/scoring/meteor.py                 |  4 -
 .../fairseq/scoring/tokenizer.py              |  4 -
 .../fairseq/scoring/wer.py                    |  4 -
 .../fairseq/search.py                         |  4 -
 .../fairseq/sequence_generator.py             |  4 -
 .../fairseq/sequence_scorer.py                |  4 -
 .../fairseq/speech_generator.py               |  4 -
 .../fairseq/tasks/__init__.py                 |  4 -
 .../fairseq/tasks/cross_lingual_lm.py         |  4 -
 .../fairseq/tasks/denoising.py                |  4 -
 .../fairseq/tasks/fairseq_task.py             |  4 -
 .../fairseq/tasks/frm_text_to_speech.py       |  4 -
 .../fairseq/tasks/language_modeling.py        |  4 -
 .../fairseq/tasks/legacy_masked_lm.py         |  4 -
 .../fairseq/tasks/masked_lm.py                |  4 -
 .../fairseq/tasks/multilingual_denoising.py   |  4 -
 .../tasks/multilingual_language_modeling.py   |  4 -
 .../fairseq/tasks/multilingual_masked_lm.py   |  4 -
 .../fairseq/tasks/multilingual_translation.py |  4 -
 .../fairseq/tasks/online_backtranslation.py   |  4 -
 .../tasks/semisupervised_translation.py       |  4 -
 .../fairseq/tasks/sentence_prediction.py      |  4 -
 .../tasks/sentence_prediction_adapters.py     |  4 -
 .../fairseq/tasks/sentence_ranking.py         |  4 -
 .../fairseq/tasks/simultaneous_translation.py |  4 -
 .../fairseq/tasks/span_masked_lm.py           |  4 -
 .../fairseq/tasks/speech_dlm_task.py          |  4 -
 .../fairseq/tasks/speech_to_speech.py         |  4 -
 .../fairseq/tasks/speech_to_text.py           |  4 -
 .../fairseq/tasks/text_to_speech.py           |  4 -
 .../fairseq/tasks/translation.py              |  4 -
 .../tasks/translation_from_pretrained_bart.py |  4 -
 .../tasks/translation_from_pretrained_xlm.py  |  4 -
 .../fairseq/tasks/translation_lev.py          |  4 -
 .../tasks/translation_multi_simple_epoch.py   |  4 -
 .../fairseq/token_generation_constraints.py   |  4 -
 .../fairseq/tokenizer.py                      |  4 -
 .../fairseq/trainer.py                        |  4 -
 .../fairseq/utils.py                          |  4 -
 .../fairseq_cli/eval_lm.py                    |  4 -
 .../fairseq_cli/generate.py                   |  4 -
 .../fairseq_cli/hydra_train.py                |  4 -
 .../fairseq_cli/hydra_validate.py             |  4 -
 .../fairseq_cli/interactive.py                |  4 -
 .../fairseq_cli/preprocess.py                 |  4 -
 .../fairseq_cli/score.py                      |  4 -
 .../fairseq_cli/train.py                      |  4 -
 .../fairseq_cli/validate.py                   |  4 -
 .../hubconf.py                                |  4 -
 .../scripts/average_checkpoints.py            |  4 -
 .../scripts/build_sym_alignment.py            |  4 -
 .../scripts/constraints/extract.py            |  4 -
 .../scripts/constraints/validate.py           |  4 -
 .../scripts/count_docs.py                     |  4 -
 .../scripts/read_binarized.py                 |  4 -
 .../scripts/rm_pt.py                          |  4 -
 .../scripts/shard_docs.py                     |  4 -
 .../scripts/split_train_valid_docs.py         |  4 -
 .../setup.py                                  |  4 -
 .../train.py                                  |  4 -
 424 files changed, 1836 deletions(-)
 delete mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
 delete mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
 delete mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
 delete mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
deleted file mode 100644
index c2f5b1a89c..0000000000
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = python -msphinx
-SPHINXPROJ    = fairseq
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
deleted file mode 100644
index 0bc049f802..0000000000
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/conf.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-#
-# fairseq documentation build configuration file, created by
-# sphinx-quickstart on Fri Aug 17 21:45:30 2018.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import os
-import sys
-from fairseq import __version__
-
-
-# source code directory, relative to this file, for sphinx-autobuild
-sys.path.insert(0, os.path.abspath(".."))
-
-source_suffix = [".rst"]
-
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.intersphinx",
-    "sphinx.ext.viewcode",
-    "sphinx.ext.napoleon",
-    "sphinxarg.ext",
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# The master toctree document.
-master_doc = "index"
-
-# General information about the project.
-project = "fairseq"
-copyright = "Facebook AI Research (FAIR)"
-author = "Facebook AI Research (FAIR)"
-
-github_doc_root = "https://github.com/pytorch/fairseq/tree/main/docs/"
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = __version__
-# The full version, including alpha/beta/rc tags.
-release = __version__
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-highlight_language = "python"
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output ----------------------------------------------
-
-html_theme = "classic"
-
-# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {
-    "numpy": ("http://docs.scipy.org/doc/numpy/", None),
-    "python": ("https://docs.python.org/", None),
-    "torch": ("https://pytorch.org/docs/master/", None),
-}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
deleted file mode 100644
index 526acffd32..0000000000
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/docutils.conf
+++ /dev/null
@@ -1,2 +0,0 @@
-[writers]
-option-limit=0
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat
deleted file mode 100644
index baa9d02a79..0000000000
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/docs/make.bat
+++ /dev/null
@@ -1,36 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=python -msphinx
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-set SPHINXPROJ=fairseq
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
-	echo.then set the SPHINXBUILD environment variable to point to the full
-	echo.path of the 'sphinx-build' executable. Alternatively you may add the
-	echo.Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-
-:end
-popd
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
index 822ff0aa50..ee4fd20304 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
index 0317d5c623..30d5d51bbd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 # import models/tasks to register them
 from . import dummy_dataset, dummy_lm, dummy_masked_lm, dummy_model, dummy_mt  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
index a44847f250..6553ba4aed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import random
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
index c6246a0c0e..2f2336b2ce 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
index 12b9c5d0f5..7571a86811 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
index ff26e4fe65..0b2325aa16 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
index 28d78cffdb..d77e1deeda 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
index 6f03d7a2cb..c148402cc0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
index fb9a6679ba..4f492ce143 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import ast
 import collections
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
index 6264236915..e69de29bb2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
@@ -1,4 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
index ecd65d34ad..1f4baec525 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
index fc1ac85404..50d4c997a6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
index 98e835fa6e..be8a1c89c8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq import utils
 from fairseq.criterions import LegacyFairseqCriterion, register_criterion
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
index 24d6bcd612..bbb0424aed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
index 0b1e64a8e3..eed86feafa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import inspect
 from typing import Any, Dict, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
index 262874b582..b99f9c8e01 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
index 325679bb16..a5477e0975 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
index 6eaedab9cf..459d435bf8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
index b55f65e5cc..df1e1953a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
index f2e8cdf3bf..92935d9c80 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
index 47ee263a8d..819250d5f3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
index 5cf70df2ab..8e54d0feb3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
index 09ddd9f3e6..fedba8eaed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
index 4c020ddbd2..ebdf81f643 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
index fc0bdaf851..3b1c38ee65 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
index 298b805768..837439a646 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
index 8a873a45b3..9d2c65e6fb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
index bfb9f058f9..8b0898422c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
index 8888180114..6c7bc0e250 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
index 06a8252140..a9438f6b6c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
index eea74bae26..8a9ef6ac27 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
index 3975468487..794a39184b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
index eeaae2b254..98726903ad 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 from .dictionary import Dictionary, TruncatedDictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
index bf89f25656..4f9c9972e5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
index 978a5b1903..60419466b1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
index 87695bd0f5..026e4811ce 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
index 590a7493ae..121cca60c5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import mmap
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
index 6be6f6521c..34595cc1bc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
index f09b065fdc..c3c4a4eae5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
index edb307e68f..52f8cfd7fd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
index fe4b61f831..d9bbb76ec8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
index cdf71558fd..d2a71d16fa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import csv
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
index 06922ea083..7b8145ac48 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
index 8f70c90df3..dc386c9b4a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
index 134d398b47..414ba0b6c9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from torch.utils.data.dataloader import default_collate
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
index 0f94100148..014fc284a6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
index a433091956..77618c5796 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
index 7a6d271379..e88ce9bada 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
index 01a4078bb1..105506864c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import bisect
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
index 625a29370e..2bb37b018e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
index 9a19cc3c18..d156d7b455 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 try:
     from collections.abc import Iterable
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
index c61f31d6b2..16e5bae3e0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
@@ -1,8 +1,4 @@
 # cython: language_level=3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
index a900fc6f96..1c83436d46 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
index d6495389f0..e6101dbd17 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 from collections import Counter
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
index 7cbe00a105..837c1ba299 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
index 31e3a06278..4b7c80ee5a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
index a305c08092..1ea23b0db9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
index f88f8f6929..2ce946895f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from fairseq.data.encoders import register_bpe
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
index 494ea21939..77a1a7885d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from fairseq.data.encoders import register_bpe
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
index f7c2103954..36bf93e423 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
index e661426a73..10327002f7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
index a41c059343..2e7c7bba84 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
index c508578d41..eaa46b9373 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
index e236dad167..7df4654fcd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
index 0ab92377b3..44ad4e3677 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.data.encoders import register_tokenizer
 from fairseq.dataclass import FairseqDataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
index 0aa6cd7681..48d310ac9d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
index 925ad41b7c..24bc3cea75 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
index 5d724d2730..7b04ebe537 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
index d93eb532ef..d108773aeb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq.data import encoders
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
index 2bde7fc57b..3b731c622a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import numpy as np
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
index 007011974a..d07b42c7e2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 import subprocess
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
index 9b61fafadb..61b339e1c2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .huffman_coder import HuffmanCodeBuilder, HuffmanCoder
 from .huffman_mmap_indexed_dataset import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
index c04f84564e..9a4989e543 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import re
 import typing as tp
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
index 9b098f2c2b..7d1a58912c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import mmap
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
index 3e4d7969cf..85387c6ed2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
index 1947d99408..d938888b5d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import shutil
 import struct
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
index a488265137..f7f8ae1b99 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
index fd356ddd04..cae8ccd16c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
index 9bd5c72b5e..19d47fb207 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .block_pair_dataset import BlockPairDataset
 from .masked_lm_dataset import MaskedLMDataset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
index ba069b4605..221311f4c1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
index dd8ea2c60a..f4c895eebd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Dict, List, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
index dee88f7a3e..635ce3866b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.data import Dictionary
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
index 12f00aa436..0f3db5b4d7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
index 1a945927cf..190b89e320 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
index a7854ac170..d187afa1ca 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from functools import lru_cache
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
index 0ca9051c9a..3852277b72 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from functools import lru_cache
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
index 54fd583b64..3df617bc5e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
index 6f2fe074b2..af1830a17c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import asyncio
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
index e2e9fdf004..d601881c82 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import OrderedDict
 from typing import Callable, Dict, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
index 6264236915..e69de29bb2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
@@ -1,4 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
index 876dfcec36..75f041b419 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
index ece9a9721e..e378ca578f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import datetime
 import hashlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
index bb187a8dc2..33538ae7c3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import hashlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
index 140c68f01d..7fcaddf751 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
index 52e74abdda..97a919878b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
index e92e83c2cd..5ed11cf83f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
index 99a17495c7..da6d29fa2c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import FairseqDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
index ac86dfd2f1..35b27d9261 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
index 6fabbdcdaa..559e98d998 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
index b512d370f9..705033977d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.data import data_utils
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
index d7f7b88dbb..e4aaa80743 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
index 459fb8acd7..ba492e2a18 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import hashlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
index ad74784d2d..efe919fee9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
index fd1331f4c4..587edd6726 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
index d054904f41..58632ee3e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
index 5aac2ba96b..1b33cc29d2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
index 2d77ed79d7..bc1056503e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
index a2915eeb3e..46604a0351 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
index 2cb7447ea9..845d46b5fd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from collections import OrderedDict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
index 6ebb5d88fe..f0e8dc2798 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 from fairseq.data import data_utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
index b3890e7279..3000cadefd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
index 72189bd378..4b9fc93d0b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
index 06c4808f0a..e1816066d8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
index cae39ba4d2..88457b846b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
index fe5c7e2ac8..edb3a1bfc5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
index d699f2ea29..e48c802941 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from enum import Enum
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
index a414e7ef64..7a92af4267 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
index 08af4f3061..4b23c4022c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
@@ -1,8 +1,4 @@
 # cython: language_level=3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
index effa127d50..aa1f4bc3ef 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
index fb14ff018e..a0acc0f9b5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
index d8b2109014..561696dcc3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
index 25408d28ec..370c944db4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .configs import FairseqDataclass
 from .constants import ChoiceEnum
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
index af957fec64..8e4fbfa12e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 import sys
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
index 5af92f2b3a..d7844a36f9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from enum import Enum, EnumMeta
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
index 5a7784bad1..646225ae42 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
index f6467d5f40..7c8ce075a4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import ast
 import inspect
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
index 9130db8f5d..b51cb7ab5b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .distributed_timeout_wrapper import DistributedTimeoutWrapper
 from .fully_sharded_data_parallel import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
index 6e06b4b6dd..dc84d60db7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
index 1c508b05dd..731e740f77 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
index cd434c7372..715cdf5334 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 A modified version of the legacy DistributedDataParallel module that uses c10d
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
index 904dc0c202..950c1e6469 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from torch import nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
index 3b9e103301..2d39ca583b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
index 968830d585..9afe8d1ece 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import io
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
index 3f27549099..c83f42f0f5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 import typing as tp
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
index 8eca70a066..468c281c82 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
@@ -1,9 +1,5 @@
 #!/usr/bin/env python3
 
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
index b99da2e8cd..9204fbd02c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 Utilities for working with the local dataset cache.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
index b0c2da15bf..bb9b730a77 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
index b26e6cd01c..f1c2b0cce4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import uuid
 from typing import Dict, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
index 3d32c6bf4d..99edc43447 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import namedtuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
index 495bd08300..41372dcc0d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import bisect
 import time
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
index 49301f27f8..d16e13b4bf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 A standalone module for aggregating metrics.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
index 4c64b61bad..b9d2916ea8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 Wrapper around various loggers and progress bars (e.g., tqdm).
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
index 69f2168487..4811d61956 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import criterions, models, modules  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
index 5fae7bd4c2..8cd368c35d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
index 5ffbaa8764..ef15075284 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
index aedf608bce..b7c6acb6f6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 Train a network across multiple GPUs.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
index 3532479e52..99de782fac 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
index 117827c3e9..71785a3745 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
index 85dbd44b3c..47f3accd60 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from collections import namedtuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
index 7bb0c9ada8..29d1ed0f13 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
index cf3b2e8baf..942c45eaac 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
index 03e4dbe263..2fb56d312e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
index 11603217a1..27839ae72d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 from .multihead_attention import ModelParallelMultiheadAttention
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
index bbea450950..c71156071f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
index 7ab53c6e5f..62e8a8581f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
 from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
index 11cf6ee530..06e22aa403 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
index a701923f7e..1c9ce80f68 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .hub_interface import *  # noqa
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
index 6b647c9642..4c09efbaca 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
index e3670c0a2c..20635b5c6e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 BART: Denoising Sequence-to-Sequence Pre-training for
 Natural Language Generation, Translation, and Comprehension
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
index 4e20fe3a83..6354049675 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .fairseq_encoder import FairseqEncoder
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
index fd76bcd4bf..921656ac27 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
index 503ceaa609..9257a1a3c1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
index 13b73d639e..4328ca8689 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
index 08cbde15a4..e1e3d0c948 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, NamedTuple, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
index cc72a0f8f3..bac6d8a5a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from typing import Dict, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
index 65ead9dcf2..3682e633f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Base classes for various fairseq models.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
index c99a215101..a0b97cfc20 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
index 4b243d6669..cb2db7a6de 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
index 8357ef7847..59be5a9bb5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
index a1b0eabbdb..7f04042e5f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .hubert import *  # noqa
 from .hubert_asr import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
index 0c0349eb2f..757a897eff 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
index 11c85ce7d1..4328c11fd6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
index f7911c2c8e..7416ecaba3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
index 3a8eb78198..25c47c811b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
index 7950280e30..fd26bac53c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Any, Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
index 1d9efc4e42..0f27805e09 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
index 8a29156270..82d7a55efb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
index 454f0ac36f..5dbfa472fd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
index 5cb49dd77c..a357ed8245 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
index 732d66b1d5..6d2807a26e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
index e722b647ed..ce03b68e71 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
index 05fe822487..e7788467c5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 from .fairseq_nat_model import *
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
index c876e9453c..784a834260 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 This file implements:
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
index a5594a4ed9..b03a77f952 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
index bc28000f59..f8c83e1b89 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
index bc39509980..037c4d44d8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq.models import register_model, register_model_architecture
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
index d60d3c52d5..aee7669fcb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
index 375a98c2e1..660e3dc6a1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq.utils import new_arange
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
index d4b3cd931c..c255baebb0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from fairseq.models import register_model, register_model_architecture
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
index 0a0221f9c4..fa1510e7d1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
index d114202d25..c0b519c463 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
index 4cd723ae96..7ce4f3cfe8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .hub_interface import *  # noqa
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
index ccc7f74cb9..8b3d56f652 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import Counter
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
index ba298d63ba..618ef5247c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
index d7ced9190c..9284634168 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 RoBERTa: A Robustly Optimized BERT Pretraining Approach.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
index 46447546fa..be645eeca6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 CamemBERT: a Tasty French Language Model
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
index dc7a019b33..a5c6bbd09a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 GottBERT: a pure German Language Model
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
index cf6e354d53..aa6a86d084 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Unsupervised Cross-lingual Representation Learning at Scale
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
index 6ea914d6a5..6613b1dbe0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .speech_dlm import *  # noqa
 from .hub_interface import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
index 11bc0f50bb..76a449680a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
index a14a1d64a8..9c43a7c56c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Any, Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
index fb65fdf810..8442bc6e78 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Tuple, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
index a88e144577..0f03a0c9d0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .multichannel_sequence_generator import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
index db4b77f345..edbe182c2f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
index 24807b866d..94c2d83b5c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
index dc13f565f1..7a3c8186fb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
index f29215c2fe..f302e4b658 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .s2s_conformer import *  # noqa
 from .s2s_conformer_translatotron2 import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
index 721efbf61a..057a104e90 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from torch import nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
index 5955a08538..b7cfabf371 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
index 68f42c2b36..923d79f1df 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Any, Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
index fb1af433d8..79c11f6b1d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
index 636396d536..47eb65a74a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
index 8016daee8d..42c320bdea 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
index 64388d6d16..bf904ff106 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
index 07393d2598..534c1df73b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
index 62ef663efb..0e6b7da038 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .berard import *  # noqa
 from .convtransformer import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
index 4d0fc02aee..e5945bcad3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
index d78427f687..c4d683b2cf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
index 2d330f96f6..ec95a6d0a4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import List, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
index 526d7540c5..0739a19d18 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
index 046421620a..e40f818571 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.models import FairseqDecoder, FairseqEncoder
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
index 79dbbec1b9..4b57ed110f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
index 50fae2ffa2..6d16d47d08 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
index 7b4b234641..ce780faca5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
index f77ef4e570..0f05b5ee80 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
index c0dcd69b07..ad3cf31d50 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .tacotron2 import *  # noqa
 from .tts_transformer import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
index fb2d0df37d..c3117829c3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
index e251c65c1d..d5f3d117e0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import random
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
index 4df4075617..9f0876c20c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
index 19afc2b717..d7541068f4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from typing import List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
index dbc02da368..173e1a9415 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import json
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
index f9f097f04b..a10c8bd0a9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
index 4650de2e17..1fef3e6ebc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
index c22e5625d4..97e0c44dc6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Any, Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
index c5e7101794..15816a0e4f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Any, Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
index 17369ec8dc..c738a6ddea 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
index 00d14a7dde..955fb77abd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.dataclass.utils import gen_parser_from_dataclass
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
index eaf585bd10..e93aa2e46c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.models import register_model, register_model_architecture
 from fairseq.models.transformer import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
index 236d9942e1..6a679ebccd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 from typing import Any, Dict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
index 1e3aa72d38..e0ae91d8f4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
index 0fc9ae4348..f4536bce80 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
index bacbb7a95d..315081a63c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .wav2vec import *  # noqa
 from .wav2vec2 import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
index dd52d86242..b2f3730ff2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
index af6604da10..65e3f65ba7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
index 8214bc8e7a..8acbd75f8b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
index bf261624e6..aeda70e6f9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
index ff89759d38..95febcdaec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.models import BaseFairseqModel, register_model
 from fairseq.models.wav2vec.wav2vec2_asr import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
index bbf7694920..c36afbb194 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .model import *  # noqa
 from .transformer_layer_xmod import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
index 909bb423ca..ca460f8ee9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from fairseq.models.roberta.hub_interface import RobertaHubInterface
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
index fb6c7a8dea..585af7e3ec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from ..roberta.model_xlmr import XLMRModel
 from fairseq.models.xmod.transformer_layer_xmod import XMODTransformerEncoderLayerBase
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
index 47a91cdc23..bdacdaf1c2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.modules.transformer_layer import TransformerEncoderLayer
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
index dcfda9b82a..a784b18d2d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 from .adaptive_input import AdaptiveInput
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
index 01ac4accac..fad619c529 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
index ae0c77ba0f..02fedf8632 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import functools
 import operator
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
index e823f7bae2..fb70a72013 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
index eff1a4607f..73adf6497e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
index 181221b61b..dc686c8cf5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from typing import List, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
index aa0b5929a3..ec48a0dbff 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import functools
 from typing import Any, Dict, List, Tuple, Union
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
index 964af243ec..3cf75bad3f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
index 65e17ec94f..bafcc41fd1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
index 286c00eecc..508a0820e8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
index 5e42942a9f..5a478e9a21 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 #
 
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
index 0ff02cd62a..1034f1f546 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
index 8fcc6b8d26..d6d7cd5b8e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 This file is to re-implemented the low-rank and beam approximation of CRF layer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
index 22dc6f403d..57bfe55a90 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .dynamicconv_layer import DynamicconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
index 9304f99eb8..387f8e1266 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 def gen_forward():
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
index 711ed03483..141cfc18b6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import dynamicconv_cuda
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
index 6a21f7e2ee..ffb2d071de 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
index 3cddca7718..b5c5a698fa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from typing import List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
index c560f338fd..7c34652947 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 batch norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
index d03aac022e..0f2306ab3e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Layer norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
index 30a54496de..cbbdb9f338 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Layer norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
index a2f1ecff4a..9d2ef16c68 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with
 the corresponding GitHub repo: https://github.com/hendrycks/GELUs
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
index 08d15f55df..592a9cede7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
index 867b019f67..8869cb2339 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
index 1015c38999..5e59f31648 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
index 8961d8bcbc..b25b3f494d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 LayerDrop as described in https://arxiv.org/abs/1909.11556.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
index 0b276ce02f..1e9d63d562 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
index 378d0f7071..c96eca6402 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
index 3b2a99c122..b207235666 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .lightconv_layer import LightconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
index a25433dd8e..3e3d2d9383 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 def gen_forward():
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
index e7e597f474..202ba55b48 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import lightconv_cuda
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
index 052635be79..b236a12935 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
index ec11a95079..4ce73e7dd5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
index 1c7a9f09ac..ef69db2afc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
index dbbbfb9f2d..1b59c68a50 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
index 273308951f..53220737f3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
index 262132dfe7..e6c99cf35d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
index 97cd474b51..f12cb9eef3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
index 67f6353539..7d3a30fa93 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
index d777dfbb6c..c5ce383931 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
index c142a802e0..90e4f2beae 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .utils import SizeTracker, get_param, attrsetter, quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
index 6f15c3e46b..190db6c933 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
index b67c8e8ad6..44f8121900 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .qconv import PQConv2d  # NOQA
 from .qemb import PQEmbedding  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
index d15ec192e8..3687f2c7d4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
index 3a74ad3c4c..91ac168673 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
index 9bdd25a868..5b4753d4f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
index eddc2eb346..ef7b237187 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .em import EM, EmptyClusterResolveError
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
index eceeef8ba3..7983cbf4c0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
index b46d682c0e..7b5effef20 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 def parse_config_yaml(yaml_data):
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
index 143834f3d0..e66f535e9c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
@@ -1,6 +1,2 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .utils import quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
index 8031d9cdb2..bea430328a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from .qact import ActivationQuantizer  # NOQA
 from .qconv import IntConv2d  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
index b362c30dc7..38aa5a9677 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
index 29744744ec..7ae37cc551 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
index 3b293ac31e..bc63747729 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
index 78606a25b9..599aa61639 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
index e0f9a0c1f8..7bfae4bf36 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
index d4b1cc255b..2c0c38f0a6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from operator import attrgetter
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
index a3ce4131c6..8a9eeb69fe 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
index c96247c759..ee350ff283 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 #
 
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
index 4793ecfb52..f072e0e875 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from typing import Any, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
index 3cbd9d6785..da4175f01c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
index f41ec09327..c9b7cdf192 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn as nn
 from fairseq.modules import TransformerSentenceEncoder
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
index d95da59c24..b3a4cfb6e4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.modules import TransformerSentenceEncoderLayer
 from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
index 19e035dec5..ccb20da6e5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
index 7eb816978a..87b0752644 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
index 5d2db91ad7..cb2ae64b9a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
index f869c4b2f8..10a4a58a20 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Callable, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
index d7cca9a4bb..1f0380fbd2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 transpose last 2 dimensions of the input
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
index bbaafbd6bf..67dab04396 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.nn.functional as F
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
index ee5ee19a34..f10d9b20d7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
index bd0f911073..28b2129f74 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
index be783be896..9bfd3dabb8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
index f1a2154977..bb33c73656 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
index 042ae926b0..b6f0f494bd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
index 4f539541c1..0f609b798c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
index 678ec7c617..eac9fbf84a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
index 98ff8ad7ad..f8420c24cd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 import torch.optim
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
index cfe57d07f9..d2c274a957 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
index d6d0e04e86..2187e613a6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
index 1ef0114ed6..fb8e16044a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from collections import defaultdict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
index b218934e71..dfb70c94e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import importlib
 from collections.abc import Collection
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
index 60c47b8db0..81bee4e264 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 class DynamicLossScaler(object):
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
index 73c7c695ee..8690ef98e0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
index 6a4da342ca..db16fc4b47 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections import defaultdict
 from itertools import chain
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
index 39a2a83694..4ac74811a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import types
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
index f4f2bdb0c6..7fae6939d3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from fairseq.optim import LegacyFairseqOptimizer, register_optimizer
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
index 5b3dbc023a..089a251457 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
index 5fcaea25d4..73bbb9ed67 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from collections.abc import Collection
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
index 6c12fa56b8..c0571d8618 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from argparse import Namespace
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
index d0e7e14b7e..da801ae6cd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 from typing import Optional, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
index 987c905a23..cf987f656c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
index 57edc256fd..8bdfc64aad 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from . import LegacyFairseqLRScheduler, register_lr_scheduler
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
index 2f93db328c..49053811e5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
index b8109a7c1e..83134915dd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 from typing import Optional, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
index 5ee9c1be4a..2c92d189ef 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
index db99d4eee8..7ca71d270d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
index 4d5547c39b..0c9f77a231 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
index 2a32bd10f2..167daebcb9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
index c30a6c0fb1..d28b4b8c28 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
index 8e34fb99a1..310d9e35a9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
index 9d7f2eb9e5..23b3ef5e06 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from typing import Any, Dict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
index 920591635a..72e41dc82e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
index 1ba6ef0d33..46ab8ff74d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import multiprocessing
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
index 11fc414c85..4eceba59cb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
index 904ffcd602..257cdb6baf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from argparse import Namespace
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
index 58f2f563e4..e22fea534d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
index 6d5a8450d3..eb9ee211d5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
index e55bd2f393..5670814393 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import ctypes
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
index 5df5a1c011..f583573b13 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
index 32719956fe..3d29e9fe04 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
index b0cedd5099..ffd1fbaf39 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import unicodedata
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
index 633dc47c24..de315d7425 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
index c7378bbb51..53f426fcb5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
index 78db504e6c..7a2be389ff 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import math
 import sys
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
index 411d4df444..569da0ef3b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import sys
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
index f2cc8b5e86..b78306fb06 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
index 6da1f001f0..41e923b127 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
index 8f8fe7e2de..8a3731f177 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
index 57b824d581..7d75160f08 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
index e39d1d6848..160b442097 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
index 667f5f8ee4..6df8ab9c77 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
index 44d5324b3d..f529f7f317 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
index 9754976549..1e30303a26 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
index b064907a50..d6c5dbe7a0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
index cb5ee34554..de177fe53c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 import logging
 import os
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
index 8fd5e5954d..61b375cb7c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
index 156d085aa4..1f63687366 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
index cef7656691..3461fa5630 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
index da24fe8981..8cb92e99a4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import contextlib
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
index 432b8a52ca..da20e781d0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
index de80addaf2..9fc8e8c1a9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
index afe5569626..1fa4ce114c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
index 57f63aab67..83bdfc2246 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
index 9576b26801..10ce4be313 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from fairseq.tasks import register_task
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
index d746aa154c..cef59ac4ec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
index 340732b928..aebcda366c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
index 5aaaa95a90..ea4cd5b9c7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import json
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
index 884082112a..24287ce001 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
index 82e7e6643a..5bcd78ea9e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
index 6897ebe116..5391e6fe12 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 import itertools
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
index 0fd7a5b29f..679f875813 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
index a05f289152..202af52cc3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
 from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
index b45fecd1f4..516bc21c59 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass, field
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
index 5db36a7c79..86ed5a7694 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import datetime
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
index e708dc51bc..dffd4d1769 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """Implements tracking of constraints for a beam item.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
index 42131f7b1d..d57cbb64c9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
index 16b1b91697..20a618870a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 Train a network across multiple GPUs.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
index 4d4b350523..525ca13c0c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 import collections
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
index dbd1450a9e..9c2b93659f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """
 Evaluate the perplexity of a trained language model.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
index b8757835d4..aa7077e984 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Translate pre-processed data with a trained model.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
index 607340af0d..557533650c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
index cb6f7612d0..a4ec0a836b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
index 03265d00e8..0d96d8b65e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Translate raw text with a trained model. Batches data on-the-fly.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
index 2ba9e09338..06d54a094f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Data pre-processing: build vocabularies and binarize training data.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
index 0b207be959..f74f7c30af 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 BLEU scoring of generated translations against reference translations.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
index f771bff654..5939712050 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Train a new model on one or across multiple GPUs.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
index 4617b6d542..591f356195 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
index 5949e274ed..3a146356ed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
 
 import functools
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
index 49f4f9d912..26a8c42fd4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 import collections
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
index 0ca5c18f7b..5f237ea210 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
@@ -1,7 +1,3 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Use this script in order to build symmetric alignments for your translation
 dataset.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
index 437b373856..39907e163d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
@@ -1,9 +1,5 @@
 #!/usr/bin/env python3
 #
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 """Extracts random constraints from reference files."""
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
index d531ad9f39..03ac4b9920 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
@@ -1,9 +1,5 @@
 #!/usr/bin/env python3
 #
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import sys
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
index 58d85af85e..e43b58996d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Count the number of documents and average number of lines and tokens per
 document in a large file. Documents should be separated by a single empty line.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
index a414095d03..876d517629 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
index 6cd063d21f..33226c34dd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import argparse
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
index 97232c3c84..1db23b58b9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Split a large file into shards while respecting document boundaries. Documents
 should be separated by a single empty line.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
index ff15978528..709b7eaa60 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Split a large file into a train and valid set while respecting document
 boundaries. Documents should be separated by a single empty line.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
index dae06080c5..9c895f74c5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 
 import os
 import subprocess
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
index 321de3d9b5..70506ef4dc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
@@ -1,8 +1,4 @@
 #!/usr/bin/env python3 -u
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
 """
 Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead.
 """
-- 
Gitee


From 9a878a8aeb611a9da9869ca011995ec22bed30ed Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Fri, 9 Jun 2023 17:45:44 +0800
Subject: [PATCH 03/12] trans + optim

---
 .../fairseq/optim/adam.py                     | 378 ++++++++++++++----
 .../fairseq/optim/fairseq_optimizer.py        |  15 +-
 .../fairseq/sequence_generator.py             |   5 +-
 .../fairseq/trainer.py                        |   2 +-
 .../fairseq/utils.py                          |   5 +-
 .../fairseq_cli/train.py                      |   5 +-
 6 files changed, 303 insertions(+), 107 deletions(-)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
index eac9fbf84a..d3f711c938 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
@@ -1,6 +1,7 @@
 
 import logging
 import math
+from collections import defaultdict
 from collections.abc import Collection
 from dataclasses import dataclass, field
 from typing import Any, List
@@ -8,12 +9,12 @@ from typing import Any, List
 import torch
 import torch.distributed as dist
 import torch.optim
+from torch_npu.utils import npu_combine_tensors
 from fairseq.dataclass import FairseqDataclass
 from fairseq.optim import FairseqOptimizer, register_optimizer
 from fairseq.optim.fused_adam import get_fused_adam_class
 from omegaconf import II, OmegaConf
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -50,9 +51,9 @@ class FairseqAdam(FairseqOptimizer):
         super().__init__(cfg)
         fused_adam_cls = get_fused_adam_class()
         use_fused_adam = (
-            not getattr(cfg, "use_old_adam", False)
-            and fused_adam_cls is not None
-            and torch.cuda.is_available()
+                not getattr(cfg, "use_old_adam", False)
+                and fused_adam_cls is not None
+                and torch.cuda.is_available()
         )
         if getattr(cfg, "tpu", False):
             if self.cfg.fp16_adam_stats:
@@ -93,14 +94,15 @@ class FairseqAdam(FairseqOptimizer):
 
     def average_params(self):
         """Reduce Params is only used during BMUF distributed training."""
-        state_dict = self.optimizer.state_dict()
-        total_gpus = float(dist.get_world_size())
 
-        for _, value in state_dict["state"].items():
-            value["exp_avg"] /= total_gpus
-            value["exp_avg_sq"] /= total_gpus
-            dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
-            dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
+        state_dict = self.optimizer.state_dict()
+        total_gpus = float(dist.get_world_size())
+        # Pre-divide by the world size so the SUM all-reduce below yields the mean.
+        for _, value in state_dict["state"].items():
+            value["exp_avg"] /= total_gpus
+            value["exp_avg_sq"] /= total_gpus
+            dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
+            dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
 
 
 class Adam(torch.optim.Optimizer):
@@ -131,19 +133,23 @@ class Adam(torch.optim.Optimizer):
     """
 
     def __init__(
-        self,
-        params,
-        lr=1e-3,
-        betas=(0.9, 0.999),
-        eps=1e-8,
-        weight_decay=0,
-        amsgrad=False,
+            self,
+            params,
+            lr=1e-3,
+            betas=(0.9, 0.999),
+            eps=1e-8,
+            weight_decay=0,
+            amsgrad=False,
     ):
         defaults = dict(
             lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad
         )
         super(Adam, self).__init__(params, defaults)
 
+        self._combined_state_indexed_by_group = None
+        self._combined_params_indexed_by_group = None
+        self._combined_grads_indexed_by_group = None
+
     @property
     def supports_memory_efficient_fp16(self):
         return True
@@ -152,6 +158,7 @@ class Adam(torch.optim.Optimizer):
     def supports_flat_params(self):
         return True
 
+    @torch.no_grad()
     def step(self, closure=None):
         """Performs a single optimization step.
 
@@ -163,73 +170,272 @@ class Adam(torch.optim.Optimizer):
         if closure is not None:
             loss = closure()
 
-        for group in self.param_groups:
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.dtype in {torch.float16, torch.bfloat16}:
-                    grad = grad.float()
-                if grad.is_sparse:
-                    raise RuntimeError(
-                        "Adam does not support sparse gradients, please consider SparseAdam instead"
-                    )
-                amsgrad = group.get("amsgrad", False)
-
-                p_data_fp32 = p.data
-                if p.data.dtype in {torch.float16, torch.bfloat16}:
-                    p_data_fp32 = p_data_fp32.float()
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
-                    if amsgrad:
-                        # Maintains max of all exp. moving avg. of sq. grad. values
-                        state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32)
-                else:
-                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
-                    state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
-                    if amsgrad:
-                        state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(
-                            p_data_fp32
-                        )
-
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                if amsgrad:
-                    max_exp_avg_sq = state["max_exp_avg_sq"]
-                beta1, beta2 = group["betas"]
-
-                state["step"] += 1
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-                if amsgrad:
-                    # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
-                    # Use the max. for normalizing running avg. of gradient
-                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
-                else:
-                    denom = exp_avg_sq.sqrt().add_(group["eps"])
-
-                bias_correction1 = 1 - beta1 ** state["step"]
-                bias_correction2 = 1 - beta2 ** state["step"]
-                step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1
-
-                if group["weight_decay"] != 0:
-                    p_data_fp32.add_(
-                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
-                    )
-
-                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
-
-                if p.data.dtype in {torch.float16, torch.bfloat16}:
-                    p.data.copy_(p_data_fp32)
+        for group_index, _ in enumerate(self.param_groups):
+            self._group_step(group_index)
 
         return loss
+
+    def _init_param_state(self, p_data_fp32, state, amsgrad):
+        """Create zeroed Adam state for an unseen param, else cast existing state to match *p_data_fp32*."""
+        if len(state) == 0:
+            state["step"] = 0
+            # Exponential moving average of gradient values
+            state["exp_avg"] = torch.zeros_like(p_data_fp32)
+            # Exponential moving average of squared gradient values
+            state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
+            if amsgrad:
+                # Maintains max of all exp. moving avg. of sq. grad. values
+                state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32)
+        else:
+            state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
+            state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
+            if amsgrad:
+                state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(
+                    p_data_fp32
+                )
+
+    def _group_step(self, group_index):
+        """Apply one Adam update to the combined buffers of param group *group_index*."""
+        group = self.param_groups[group_index]
+        amsgrad = group.get("amsgrad", False)
+        beta1, beta2 = group["betas"]
+
+        # Build the combined state/param/grad buffers on first use; later calls return early.
+        self._maybe_init_combine_states()
+        self._maybe_init_combine_params_and_grads()
+
+        for combined_params_state, combined_params, combined_grads in zip(
+                self._combined_state_indexed_by_group[group_index],
+                self._combined_params_indexed_by_group[group_index],
+                self._combined_grads_indexed_by_group[group_index]):
+            # One pass per parameter kind: index 0 = params with an `expert` attribute, 1 = the rest.
+            if combined_params is None or combined_grads is None:
+                continue
+
+            # NOTE(review): the per-parameter loop this replaces upcast fp16/bf16 grads
+            # to fp32 at this point; that cast is disabled here, so the update runs in
+            # whatever dtype the combined grads have — confirm this is intended.
+            if combined_grads.is_sparse:
+                raise RuntimeError(
+                    "Adam does not support sparse gradients, please consider SparseAdam instead"
+                )
+
+            exp_avg, exp_avg_sq = combined_params_state["exp_avg"], combined_params_state["exp_avg_sq"]
+            if amsgrad:
+                max_exp_avg_sq = combined_params_state["max_exp_avg_sq"]
+
+            # Decay the first and second moment running average coefficient
+            exp_avg.mul_(beta1).add_(combined_grads, alpha=1 - beta1)
+            exp_avg_sq.mul_(beta2).addcmul_(combined_grads, combined_grads, value=1 - beta2)
+            if amsgrad:
+                # Maintains the maximum of all 2nd moment running avg. till now
+                torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+                # Use the max. for normalizing running avg. of gradient
+                denom = max_exp_avg_sq.sqrt().add_(group["eps"])
+            else:
+                denom = exp_avg_sq.sqrt().add_(group["eps"])
+            # A single step counter per combined buffer feeds both bias corrections.
+            combined_params_state['step'] += 1
+            bias_correction1 = 1 - beta1 ** combined_params_state["step"]
+            bias_correction2 = 1 - beta2 ** combined_params_state["step"]
+            step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1
+
+            if group["weight_decay"] != 0:
+                combined_params.add_(
+                    combined_params, alpha=-group["weight_decay"] * group["lr"]
+                )
+
+            combined_params.addcdiv_(exp_avg, denom, value=-step_size)
+
+    def _combine_group_param_states(self, group_index):
+        """Fuse the per-parameter Adam state of one param group into combined buffers.
+
+        Returns a two-element list indexed by parameter kind (0: params carrying an
+        ``expert`` attribute, 1: all others). An entry is ``None`` when no param of
+        that kind has a gradient; otherwise it is a mapping with the keys ``step``,
+        ``exp_avg``, ``exp_avg_sq`` and ``max_exp_avg_sq``.
+        """
+        group = self.param_groups[group_index]
+        params = group['params']
+        # Tolerate groups without the key, exactly as _group_step does.
+        amsgrad = group.get("amsgrad", False)
+
+        step_list = [[], []]
+        exp_avg_list = [[], []]
+        exp_avg_sq_list = [[], []]
+        max_exp_avg_sq_list = [[], []]
+
+        for p in params:
+            if p.grad is None:
+                continue
+            if p.grad.data.is_sparse:
+                raise RuntimeError(
+                    "Adam does not support sparse gradients, please consider SparseAdam instead"
+                )
+
+            state = self.state[p]
+            self._init_param_state(p.data, state, amsgrad)
+
+            kind = 0 if hasattr(p, 'expert') else 1
+            step_list[kind].append(state['step'])
+            exp_avg_list[kind].append(state['exp_avg'])
+            exp_avg_sq_list[kind].append(state['exp_avg_sq'])
+            if amsgrad:
+                max_exp_avg_sq_list[kind].append(state['max_exp_avg_sq'])
+
+        combined_state = [None, None]
+
+        for kind in range(len(combined_state)):
+            if not step_list[kind]:
+                continue
+
+            # NOTE(review): only the first param's step is kept; this assumes every
+            # param of a kind has been stepped the same number of times — confirm.
+            combined_state[kind] = defaultdict(dict)
+            combined_state[kind]['step'] = step_list[kind][0]
+            combined_state[kind]['exp_avg'] = npu_combine_tensors(exp_avg_list[kind])
+            combined_state[kind]['exp_avg_sq'] = npu_combine_tensors(exp_avg_sq_list[kind])
+            # Empty unless amsgrad is on; presumably npu_combine_tensors([]) is None — verify.
+            combined_state[kind]['max_exp_avg_sq'] = npu_combine_tensors(max_exp_avg_sq_list[kind])
+
+        return combined_state
+
+    def _check_initialized(self, groups):
+        """Return a truthy value iff *groups* is non-empty and some pair in it holds a non-None entry."""
+        def _one_kind_initialized(pair):
+            return pair and (pair[0] is not None or pair[1] is not None)
+
+        return groups and any(map(_one_kind_initialized, groups))
+
+    def _maybe_init_combine_states(self):
+        """Build the per-group combined optimizer state once; later calls return early."""
+        if self._check_initialized(self._combined_state_indexed_by_group):
+            return
+
+        self._combined_state_indexed_by_group = list(map(
+            self._combine_group_param_states,
+            range(len(self.param_groups))))
+
+    def _combine_params_and_grads(self, group_index):
+        """Combine the params and grads of one group, split by kind (0: `expert` params, 1: others)."""
+        group = self.param_groups[group_index]
+        params = group['params']
+
+        group_params_list = [[], []]
+        group_grads_list = [[], []]
+
+        for p in params:
+            if p.grad is None:
+                continue
+            param_size = p.storage().size()
+            grad_size = p.grad.storage().size()
+
+            if param_size != grad_size:
+                p.grad.data = p.grad.data.clone()
+
+            kind = 0 if hasattr(p, 'expert') else 1
+            group_params_list[kind].append(p)
+            group_grads_list[kind].append(p.grad)
+
+        combined_params = [None, None]
+        combined_grads = [None, None]
+        for kind in range(len(combined_params)):
+            combined_params[kind] = npu_combine_tensors(group_params_list[kind])
+            combined_grads[kind] = npu_combine_tensors(group_grads_list[kind])
+
+        return combined_params, combined_grads
+
+    def _maybe_init_combine_params_and_grads(self):
+        """Build the per-group combined param/grad buffers once; later calls return early."""
+        if self._check_initialized(self._combined_params_indexed_by_group) and \
+                self._check_initialized(self._combined_grads_indexed_by_group):
+            return
+
+        self._combined_params_indexed_by_group = []
+        self._combined_grads_indexed_by_group = []
+
+        for group_index in range(len(self.param_groups)):
+            combined_params, combined_grads = self._combine_params_and_grads(group_index)
+            self._combined_params_indexed_by_group.append(combined_params)
+            self._combined_grads_indexed_by_group.append(combined_grads)
+
+    @torch.no_grad()
+    def zero_grad(self, set_to_none=False):
+        """Zero every combined gradient buffer in place; ``set_to_none=True`` is rejected."""
+        if set_to_none:
+            raise ValueError("set_to_none is not supported in fused optimizers")
+
+        self._maybe_init_combine_params_and_grads()
+
+        for grad_pair in self._combined_grads_indexed_by_group:
+            for grad in grad_pair:
+                if grad is None:
+                    continue
+                grad.zero_()
+
+    @torch.no_grad()
+    def clip_grad_norm_(self, params, max_norm, aggregate_norm_fn=None) -> torch.Tensor:
+        """Scale the combined grads of param group 0 so the non-expert grad norm is at most
+        *max_norm*; return that (pre-clip) norm, after *aggregate_norm_fn* if one is given.
+        """
+        def grad_exists(p):
+            return p is not None and getattr(p, "grad", None) is not None
+
+        self._maybe_init_combine_params_and_grads()  # no-op once the buffers exist
+        if isinstance(params, torch.Tensor):
+            params = [params]
+        params = list(params)
+        grads = [
+            p.grad for p in params if grad_exists(p) and not hasattr(p, "expert")
+        ]
+
+        if len(grads) == 0:
+            if len(params) > 0:
+                return params[0].new_tensor(0.0)
+            else:
+                return torch.tensor(0.0)
+
+        if len(grads) == 1:
+            total_norm = torch.norm(grads[0].detach(), p=2, dtype=torch.float32)
+        else:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            elif grads[0].device.type == "xla":
+                device = grads[0].device
+            else:
+                device = torch.device("cpu")
+
+            combined_grads_without_expert = self._combined_grads_indexed_by_group[0][1]
+
+            if combined_grads_without_expert is None:
+                if len(params) > 0:
+                    return params[0].new_tensor(0.0)
+                else:
+                    return torch.tensor(0.0)
+
+            total_norm = torch.norm(combined_grads_without_expert, p=2, dtype=torch.float32).to(device)
+
+        if aggregate_norm_fn is not None:
+            total_norm = aggregate_norm_fn(total_norm)
+
+        if max_norm > 0:
+            max_norm = float(max_norm)
+            clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1)
+            for combined_grad in self._combined_grads_indexed_by_group[0]:
+                if combined_grad is not None:
+                    combined_grad.mul_(clip_coef)
+
+        return total_norm
+
+    @torch.no_grad()
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c*."""
+        # NOTE(review): only param group 0 is scaled below — confirm a single group is always used.
+        self._maybe_init_combine_states()
+        self._maybe_init_combine_params_and_grads()
+
+        device = torch.cuda.current_device()
+        factor = c.to(device) if torch.is_tensor(c) else c
+        for combined_grad in self._combined_grads_indexed_by_group[0]:
+            if combined_grad is not None:
+                combined_grad.mul_(factor)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
index 8690ef98e0..11e2e8a4b6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
@@ -98,22 +98,27 @@ class FairseqOptimizer(object):
 
     def multiply_grads(self, c):
         """Multiplies grads by a constant *c*."""
+        # Optimizers that manage combined gradient buffers provide their own scaling;
+        # every other optimizer keeps the stock per-device/per-dtype path below.
+        if hasattr(self.optimizer, "multiply_grads"):
+            return self.optimizer.multiply_grads(c)
         per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
         for p in self.params:
             if p.grad is not None:
                 if p.grad.is_sparse:
                     p.grad.data.mul_(c.to(p.grad.device) if torch.is_tensor(c) else c)
                 else:
                     per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append(
                         p.grad.data
                     )
         for device, per_dtype_grads in per_device_and_dtype_grads.items():
             for grads in per_dtype_grads.values():
                 torch._foreach_mul_(grads, c.to(device) if torch.is_tensor(c) else c)
 
     def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
         """Clips gradient norm."""
-        return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn)
+        clip_fn = getattr(self.optimizer, "clip_grad_norm_", utils.clip_grad_norm_)
+        return clip_fn(self.params, max_norm, aggregate_norm_fn)
 
     def step(self, closure=None, scale=1.0, groups=None):
         """Performs a single optimization step."""
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
index 7a2be389ff..5ed023040a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
@@ -529,7 +529,7 @@ class SequenceGenerator(nn.Module):
             # from the list of {2 * beam_size} candidates were
             # selected. Shapes: (batch size, beam size)
             new_cands_to_ignore, active_hypos = torch.topk(
-                active_mask, k=beam_size, dim=1, largest=False
+                active_mask.float(), k=beam_size, dim=1, largest=False
             )
 
             # update cands_to_ignore to ignore any finalized hypos.
@@ -544,7 +544,7 @@ class SequenceGenerator(nn.Module):
             active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos)
             active_scores = torch.gather(cand_scores, dim=1, index=active_hypos)
 
-            active_bbsz_idx = active_bbsz_idx.view(-1)
+            active_bbsz_idx = active_bbsz_idx.view(-1).to(torch.int64)
             active_scores = active_scores.view(-1)
 
             # copy tokens and scores for active hypotheses
@@ -647,6 +647,7 @@ class SequenceGenerator(nn.Module):
         """
         assert bbsz_idx.numel() == eos_scores.numel()
 
+        bbsz_idx = bbsz_idx.to(torch.int64)
         # clone relevant token and attention tensors.
         # tokens is (batch * beam, max_len). So the index_select
         # gets the newly EOS rows, then selects cols 1..{step + 2}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
index 20a618870a..da6a14f293 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
@@ -55,7 +55,7 @@ class Trainer(object):
         self.tpu = cfg.common.tpu
         self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu
         if self.cuda:
-            self.device = torch.device("cuda")
+            self.device = torch.device("npu")
         elif self.tpu:
             self.device = utils.get_tpu_device()
         else:
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
index 525ca13c0c..2d2bba6706 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
@@ -750,10 +750,8 @@ def xla_device_to_cpu(dat):
 class CudaEnvironment(object):
     def __init__(self):
         cur_device = torch.cuda.current_device()
-        prop = torch.cuda.get_device_properties("cuda:{}".format(cur_device))
+        prop = torch.cuda.get_device_properties("npu:{}".format(cur_device))
         self.name = prop.name
-        self.major = prop.major
-        self.minor = prop.minor
         self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024
 
     @staticmethod
@@ -769,7 +767,6 @@ class CudaEnvironment(object):
         for r, env in enumerate(cuda_env_list):
             logger.info(
                 "rank {:3d}: ".format(r)
-                + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor)
                 + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB)
                 + "name = {:40s}".format(env.name)
             )
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
index 5939712050..0ef29864a7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
@@ -35,9 +35,12 @@ from fairseq.file_io import PathManager
 from fairseq.logging import meters, metrics, progress_bar
 from fairseq.model_parallel.megatron_trainer import MegatronTrainer
 from fairseq.trainer import Trainer
-
+import transfer_to_npu
+import torch_npu
 
 def main(cfg: FairseqConfig) -> None:
+    torch_npu.npu.set_compile_mode(jit_compile=False)
+
     if isinstance(cfg, argparse.Namespace):
         cfg = convert_namespace_to_omegaconf(cfg)
 
-- 
Gitee


From afc69a8f05ad37e79c20cc8f1fcb216fa687ac5e Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Mon, 12 Jun 2023 10:29:13 +0800
Subject: [PATCH 04/12] huawei license

---
 .../fairseq/__init__.py                       | 33 +++++++++++++++++++
 .../fairseq/benchmark/__init__.py             | 33 +++++++++++++++++++
 .../benchmark_multihead_attention.py          | 33 +++++++++++++++++++
 .../fairseq/benchmark/dummy_dataset.py        | 33 +++++++++++++++++++
 .../fairseq/benchmark/dummy_lm.py             | 33 +++++++++++++++++++
 .../fairseq/benchmark/dummy_masked_lm.py      | 33 +++++++++++++++++++
 .../fairseq/benchmark/dummy_model.py          | 33 +++++++++++++++++++
 .../fairseq/benchmark/dummy_mt.py             | 33 +++++++++++++++++++
 .../fairseq/binarizer.py                      | 33 +++++++++++++++++++
 .../fairseq/checkpoint_utils.py               | 33 +++++++++++++++++++
 .../fairseq/config/__init__.py                | 33 +++++++++++++++++++
 .../fairseq/criterions/__init__.py            | 33 +++++++++++++++++++
 .../fairseq/criterions/adaptive_loss.py       | 33 +++++++++++++++++++
 .../fairseq/criterions/composite_loss.py      | 33 +++++++++++++++++++
 .../fairseq/criterions/cross_entropy.py       | 33 +++++++++++++++++++
 .../fairseq/criterions/ctc.py                 | 33 +++++++++++++++++++
 .../fairseq/criterions/fairseq_criterion.py   | 33 +++++++++++++++++++
 .../fairseq/criterions/fastspeech2_loss.py    | 33 +++++++++++++++++++
 .../fairseq/criterions/hubert_criterion.py    | 33 +++++++++++++++++++
 .../label_smoothed_cross_entropy.py           | 33 +++++++++++++++++++
 ...moothed_cross_entropy_latency_augmented.py | 33 +++++++++++++++++++
 ...l_smoothed_cross_entropy_with_alignment.py | 33 +++++++++++++++++++
 .../label_smoothed_cross_entropy_with_ctc.py  | 33 +++++++++++++++++++
 ...label_smoothed_cross_entropy_with_rdrop.py | 33 +++++++++++++++++++
 .../fairseq/criterions/legacy_masked_lm.py    | 33 +++++++++++++++++++
 .../fairseq/criterions/masked_lm.py           | 33 +++++++++++++++++++
 .../fairseq/criterions/model_criterion.py     | 33 +++++++++++++++++++
 .../fairseq/criterions/nat_loss.py            | 33 +++++++++++++++++++
 .../fairseq/criterions/sentence_prediction.py | 33 +++++++++++++++++++
 .../sentence_prediction_adapters.py           | 33 +++++++++++++++++++
 .../fairseq/criterions/sentence_ranking.py    | 33 +++++++++++++++++++
 .../criterions/speech_dlm_criterion.py        | 33 +++++++++++++++++++
 .../criterions/speech_to_speech_criterion.py  | 33 +++++++++++++++++++
 .../criterions/speech_ulm_criterion.py        | 33 +++++++++++++++++++
 .../fairseq/criterions/tacotron2_loss.py      | 33 +++++++++++++++++++
 .../fairseq/criterions/wav2vec_criterion.py   | 33 +++++++++++++++++++
 .../fairseq/data/__init__.py                  | 33 +++++++++++++++++++
 .../fairseq/data/add_class_target_dataset.py  | 33 +++++++++++++++++++
 .../fairseq/data/add_target_dataset.py        | 33 +++++++++++++++++++
 .../fairseq/data/append_token_dataset.py      | 33 +++++++++++++++++++
 .../fairseq/data/audio/__init__.py            | 33 +++++++++++++++++++
 .../fairseq/data/audio/audio_utils.py         | 33 +++++++++++++++++++
 .../fairseq/data/audio/data_cfg.py            | 33 +++++++++++++++++++
 .../data/audio/dataset_transforms/__init__.py | 33 +++++++++++++++++++
 .../audio/dataset_transforms/concataugment.py | 33 +++++++++++++++++++
 .../dataset_transforms/noisyoverlapaugment.py | 33 +++++++++++++++++++
 .../data/audio/feature_transforms/__init__.py | 33 +++++++++++++++++++
 .../audio/feature_transforms/delta_deltas.py  | 33 +++++++++++++++++++
 .../audio/feature_transforms/global_cmvn.py   | 33 +++++++++++++++++++
 .../audio/feature_transforms/specaugment.py   | 33 +++++++++++++++++++
 .../feature_transforms/utterance_cmvn.py      | 33 +++++++++++++++++++
 .../data/audio/frm_text_to_speech_dataset.py  | 33 +++++++++++++++++++
 .../fairseq/data/audio/hubert_dataset.py      | 33 +++++++++++++++++++
 .../data/audio/multi_modality_dataset.py      | 33 +++++++++++++++++++
 .../fairseq/data/audio/raw_audio_dataset.py   | 33 +++++++++++++++++++
 .../data/audio/speech_to_speech_dataset.py    | 33 +++++++++++++++++++
 .../data/audio/speech_to_text_dataset.py      | 33 +++++++++++++++++++
 .../audio/speech_to_text_joint_dataset.py     | 33 +++++++++++++++++++
 .../data/audio/text_to_speech_dataset.py      | 33 +++++++++++++++++++
 .../audio/waveform_transforms/__init__.py     | 33 +++++++++++++++++++
 .../audio/waveform_transforms/noiseaugment.py | 33 +++++++++++++++++++
 .../fairseq/data/backtranslation_dataset.py   | 33 +++++++++++++++++++
 .../fairseq/data/base_wrapper_dataset.py      | 33 +++++++++++++++++++
 .../fairseq/data/bucket_pad_length_dataset.py | 33 +++++++++++++++++++
 .../fairseq/data/codedataset.py               | 33 +++++++++++++++++++
 .../fairseq/data/colorize_dataset.py          | 33 +++++++++++++++++++
 .../fairseq/data/concat_dataset.py            | 33 +++++++++++++++++++
 .../fairseq/data/concat_sentences_dataset.py  | 33 +++++++++++++++++++
 .../fairseq/data/data_utils.py                | 33 +++++++++++++++++++
 .../fairseq/data/denoising_dataset.py         | 33 +++++++++++++++++++
 .../fairseq/data/dictionary.py                | 33 +++++++++++++++++++
 .../fairseq/data/encoders/__init__.py         | 33 +++++++++++++++++++
 .../fairseq/data/encoders/byte_bpe.py         | 33 +++++++++++++++++++
 .../fairseq/data/encoders/byte_utils.py       | 33 +++++++++++++++++++
 .../fairseq/data/encoders/bytes.py            | 33 +++++++++++++++++++
 .../fairseq/data/encoders/characters.py       | 33 +++++++++++++++++++
 .../fairseq/data/encoders/fastbpe.py          | 33 +++++++++++++++++++
 .../fairseq/data/encoders/gpt2_bpe.py         | 33 +++++++++++++++++++
 .../fairseq/data/encoders/gpt2_bpe_utils.py   | 33 +++++++++++++++++++
 .../fairseq/data/encoders/hf_bert_bpe.py      | 33 +++++++++++++++++++
 .../fairseq/data/encoders/hf_byte_bpe.py      | 33 +++++++++++++++++++
 .../fairseq/data/encoders/moses_tokenizer.py  | 33 +++++++++++++++++++
 .../fairseq/data/encoders/nltk_tokenizer.py   | 33 +++++++++++++++++++
 .../data/encoders/sentencepiece_bpe.py        | 33 +++++++++++++++++++
 .../fairseq/data/encoders/space_tokenizer.py  | 33 +++++++++++++++++++
 .../fairseq/data/encoders/subword_nmt_bpe.py  | 33 +++++++++++++++++++
 .../fairseq/data/encoders/utils.py            | 33 +++++++++++++++++++
 .../fairseq/data/fairseq_dataset.py           | 33 +++++++++++++++++++
 .../fairseq/data/fasta_dataset.py             | 33 +++++++++++++++++++
 .../fairseq/data/huffman/__init__.py          | 33 +++++++++++++++++++
 .../fairseq/data/huffman/huffman_coder.py     | 33 +++++++++++++++++++
 .../huffman/huffman_mmap_indexed_dataset.py   | 33 +++++++++++++++++++
 .../fairseq/data/id_dataset.py                | 33 +++++++++++++++++++
 .../fairseq/data/indexed_dataset.py           | 33 +++++++++++++++++++
 .../fairseq/data/iterators.py                 | 33 +++++++++++++++++++
 .../fairseq/data/language_pair_dataset.py     | 33 +++++++++++++++++++
 .../fairseq/data/legacy/__init__.py           | 33 +++++++++++++++++++
 .../fairseq/data/legacy/block_pair_dataset.py | 33 +++++++++++++++++++
 .../fairseq/data/legacy/masked_lm_dataset.py  | 33 +++++++++++++++++++
 .../data/legacy/masked_lm_dictionary.py       | 33 +++++++++++++++++++
 .../fairseq/data/list_dataset.py              | 33 +++++++++++++++++++
 .../fairseq/data/lm_context_window_dataset.py | 33 +++++++++++++++++++
 .../fairseq/data/lru_cache_dataset.py         | 33 +++++++++++++++++++
 .../fairseq/data/mask_tokens_dataset.py       | 33 +++++++++++++++++++
 .../fairseq/data/monolingual_dataset.py       | 33 +++++++++++++++++++
 .../fairseq/data/multi_corpus_dataset.py      | 33 +++++++++++++++++++
 .../data/multi_corpus_sampled_dataset.py      | 33 +++++++++++++++++++
 .../fairseq/data/multilingual/__init__.py     | 33 +++++++++++++++++++
 .../multilingual/multilingual_data_manager.py | 33 +++++++++++++++++++
 .../data/multilingual/multilingual_utils.py   | 33 +++++++++++++++++++
 .../multilingual/sampled_multi_dataset.py     | 33 +++++++++++++++++++
 .../sampled_multi_epoch_dataset.py            | 33 +++++++++++++++++++
 .../data/multilingual/sampling_method.py      | 33 +++++++++++++++++++
 .../fairseq/data/nested_dictionary_dataset.py | 33 +++++++++++++++++++
 .../fairseq/data/noising.py                   | 33 +++++++++++++++++++
 .../fairseq/data/num_samples_dataset.py       | 33 +++++++++++++++++++
 .../fairseq/data/numel_dataset.py             | 33 +++++++++++++++++++
 .../fairseq/data/offset_tokens_dataset.py     | 33 +++++++++++++++++++
 .../fairseq/data/pad_dataset.py               | 33 +++++++++++++++++++
 .../fairseq/data/padding_mask_dataset.py      | 33 +++++++++++++++++++
 .../fairseq/data/plasma_utils.py              | 33 +++++++++++++++++++
 .../fairseq/data/prepend_dataset.py           | 33 +++++++++++++++++++
 .../fairseq/data/prepend_token_dataset.py     | 33 +++++++++++++++++++
 .../fairseq/data/raw_label_dataset.py         | 33 +++++++++++++++++++
 .../fairseq/data/replace_dataset.py           | 33 +++++++++++++++++++
 .../fairseq/data/resampling_dataset.py        | 33 +++++++++++++++++++
 .../fairseq/data/roll_dataset.py              | 33 +++++++++++++++++++
 .../fairseq/data/round_robin_zip_datasets.py  | 33 +++++++++++++++++++
 .../fairseq/data/shorten_dataset.py           | 33 +++++++++++++++++++
 .../fairseq/data/sort_dataset.py              | 33 +++++++++++++++++++
 .../fairseq/data/span_mask_tokens_dataset.py  | 33 +++++++++++++++++++
 .../fairseq/data/speech_dlm_dataset.py        | 33 +++++++++++++++++++
 .../fairseq/data/strip_token_dataset.py       | 33 +++++++++++++++++++
 .../fairseq/data/subsample_dataset.py         | 33 +++++++++++++++++++
 .../fairseq/data/text_compressor.py           | 33 +++++++++++++++++++
 .../fairseq/data/token_block_dataset.py       | 33 +++++++++++++++++++
 .../transform_eos_concat_langpair_dataset.py  | 33 +++++++++++++++++++
 .../fairseq/data/transform_eos_dataset.py     | 33 +++++++++++++++++++
 .../data/transform_eos_lang_pair_dataset.py   | 33 +++++++++++++++++++
 .../fairseq/dataclass/__init__.py             | 33 +++++++++++++++++++
 .../fairseq/dataclass/configs.py              | 33 +++++++++++++++++++
 .../fairseq/dataclass/constants.py            | 33 +++++++++++++++++++
 .../fairseq/dataclass/initialize.py           | 33 +++++++++++++++++++
 .../fairseq/dataclass/utils.py                | 33 +++++++++++++++++++
 .../fairseq/distributed/__init__.py           | 33 +++++++++++++++++++
 .../distributed_timeout_wrapper.py            | 33 +++++++++++++++++++
 .../fully_sharded_data_parallel.py            | 33 +++++++++++++++++++
 .../legacy_distributed_data_parallel.py       | 33 +++++++++++++++++++
 .../distributed/module_proxy_wrapper.py       | 33 +++++++++++++++++++
 .../tpu_distributed_data_parallel.py          | 33 +++++++++++++++++++
 .../fairseq/distributed/utils.py              | 33 +++++++++++++++++++
 .../fairseq/file_chunker_utils.py             | 33 +++++++++++++++++++
 .../fairseq/file_io.py                        | 33 +++++++++++++++++++
 .../fairseq/file_utils.py                     | 33 +++++++++++++++++++
 .../fairseq/hub_utils.py                      | 33 +++++++++++++++++++
 .../fairseq/incremental_decoding_utils.py     | 33 +++++++++++++++++++
 .../fairseq/iterative_refinement_generator.py | 33 +++++++++++++++++++
 .../fairseq/logging/__init__.py               | 33 +++++++++++++++++++
 .../fairseq/logging/meters.py                 | 33 +++++++++++++++++++
 .../fairseq/logging/metrics.py                | 33 +++++++++++++++++++
 .../fairseq/logging/progress_bar.py           | 33 +++++++++++++++++++
 .../fairseq/model_parallel/__init__.py        | 33 +++++++++++++++++++
 .../model_parallel/criterions/__init__.py     | 33 +++++++++++++++++++
 .../vocab_parallel_cross_entropy.py           | 33 +++++++++++++++++++
 .../model_parallel/megatron_trainer.py        | 33 +++++++++++++++++++
 .../fairseq/model_parallel/models/__init__.py | 33 +++++++++++++++++++
 .../pipeline_parallel_transformer/__init__.py | 33 +++++++++++++++++++
 .../pipeline_parallel_transformer/layers.py   | 33 +++++++++++++++++++
 .../pipeline_parallel_transformer/model.py    | 33 +++++++++++++++++++
 .../model_parallel/models/transformer.py      | 33 +++++++++++++++++++
 .../model_parallel/models/transformer_lm.py   | 33 +++++++++++++++++++
 .../model_parallel/modules/__init__.py        | 33 +++++++++++++++++++
 .../modules/multihead_attention.py            | 33 +++++++++++++++++++
 .../modules/transformer_layer.py              | 33 +++++++++++++++++++
 .../fairseq/models/__init__.py                | 33 +++++++++++++++++++
 .../fairseq/models/bart/__init__.py           | 33 +++++++++++++++++++
 .../fairseq/models/bart/hub_interface.py      | 33 +++++++++++++++++++
 .../fairseq/models/bart/model.py              | 33 +++++++++++++++++++
 .../fairseq/models/composite_encoder.py       | 33 +++++++++++++++++++
 .../models/distributed_fairseq_model.py       | 33 +++++++++++++++++++
 .../fairseq/models/ema/__init__.py            | 33 +++++++++++++++++++
 .../fairseq/models/ema/ema.py                 | 33 +++++++++++++++++++
 .../fairseq/models/fairseq_decoder.py         | 33 +++++++++++++++++++
 .../fairseq/models/fairseq_encoder.py         | 33 +++++++++++++++++++
 .../models/fairseq_incremental_decoder.py     | 33 +++++++++++++++++++
 .../fairseq/models/fairseq_model.py           | 33 +++++++++++++++++++
 .../fairseq/models/fconv.py                   | 33 +++++++++++++++++++
 .../fairseq/models/fconv_lm.py                | 33 +++++++++++++++++++
 .../fairseq/models/fconv_self_att.py          | 33 +++++++++++++++++++
 .../fairseq/models/hubert/__init__.py         | 33 +++++++++++++++++++
 .../fairseq/models/hubert/hubert.py           | 33 +++++++++++++++++++
 .../fairseq/models/hubert/hubert_asr.py       | 33 +++++++++++++++++++
 .../fairseq/models/huggingface/__init__.py    | 33 +++++++++++++++++++
 .../fairseq/models/huggingface/hf_gpt2.py     | 33 +++++++++++++++++++
 .../fairseq/models/lightconv.py               | 33 +++++++++++++++++++
 .../fairseq/models/lightconv_lm.py            | 33 +++++++++++++++++++
 .../fairseq/models/lstm.py                    | 33 +++++++++++++++++++
 .../fairseq/models/lstm_lm.py                 | 33 +++++++++++++++++++
 .../fairseq/models/masked_lm.py               | 33 +++++++++++++++++++
 .../fairseq/models/model_utils.py             | 33 +++++++++++++++++++
 .../models/multilingual_transformer.py        | 33 +++++++++++++++++++
 .../fairseq/models/nat/__init__.py            | 33 +++++++++++++++++++
 .../fairseq/models/nat/cmlm_transformer.py    | 33 +++++++++++++++++++
 .../fairseq/models/nat/fairseq_nat_model.py   | 33 +++++++++++++++++++
 .../models/nat/insertion_transformer.py       | 33 +++++++++++++++++++
 ...iterative_nonautoregressive_transformer.py | 33 +++++++++++++++++++
 .../models/nat/levenshtein_transformer.py     | 33 +++++++++++++++++++
 .../fairseq/models/nat/levenshtein_utils.py   | 33 +++++++++++++++++++
 .../fairseq/models/nat/nat_crf_transformer.py | 33 +++++++++++++++++++
 .../models/nat/nonautoregressive_ensembles.py | 33 +++++++++++++++++++
 .../nat/nonautoregressive_transformer.py      | 33 +++++++++++++++++++
 .../fairseq/models/roberta/__init__.py        | 33 +++++++++++++++++++
 .../fairseq/models/roberta/alignment_utils.py | 33 +++++++++++++++++++
 .../fairseq/models/roberta/enc_dec.py         | 33 +++++++++++++++++++
 .../fairseq/models/roberta/hub_interface.py   | 33 +++++++++++++++++++
 .../fairseq/models/roberta/model.py           | 33 +++++++++++++++++++
 .../fairseq/models/roberta/model_camembert.py | 33 +++++++++++++++++++
 .../fairseq/models/roberta/model_gottbert.py  | 33 +++++++++++++++++++
 .../fairseq/models/roberta/model_xlmr.py      | 33 +++++++++++++++++++
 .../fairseq/models/speech_dlm/__init__.py     | 33 +++++++++++++++++++
 .../models/speech_dlm/hub_interface.py        | 33 +++++++++++++++++++
 .../models/speech_dlm/modules/__init__.py     | 33 +++++++++++++++++++
 .../speech_dlm/modules/speech_dlm_decoder.py  | 33 +++++++++++++++++++
 .../modules/speech_dlm_decoder_layer.py       | 33 +++++++++++++++++++
 .../speech_dlm/sequence_generator/__init__.py | 33 +++++++++++++++++++
 .../sequence_generator/multichannel_search.py | 33 +++++++++++++++++++
 .../multichannel_sequence_generator.py        | 33 +++++++++++++++++++
 .../fairseq/models/speech_dlm/speech_dlm.py   | 33 +++++++++++++++++++
 .../models/speech_to_speech/__init__.py       | 33 +++++++++++++++++++
 .../speech_to_speech/modules/__init__.py      | 33 +++++++++++++++++++
 .../speech_to_speech/modules/ctc_decoder.py   | 33 +++++++++++++++++++
 .../modules/stacked_embedding.py              | 33 +++++++++++++++++++
 .../modules/transformer_decoder_aug.py        | 33 +++++++++++++++++++
 .../modules/transformer_encoder.py            | 33 +++++++++++++++++++
 .../models/speech_to_speech/s2s_conformer.py  | 33 +++++++++++++++++++
 .../s2s_conformer_translatotron2.py           | 33 +++++++++++++++++++
 .../speech_to_speech/s2s_conformer_unity.py   | 33 +++++++++++++++++++
 .../speech_to_speech/s2s_transformer.py       | 33 +++++++++++++++++++
 .../fairseq/models/speech_to_text/__init__.py | 33 +++++++++++++++++++
 .../fairseq/models/speech_to_text/berard.py   | 33 +++++++++++++++++++
 .../models/speech_to_text/convtransformer.py  | 33 +++++++++++++++++++
 .../models/speech_to_text/hub_interface.py    | 33 +++++++++++++++++++
 .../models/speech_to_text/modules/__init__.py | 33 +++++++++++++++++++
 .../modules/augmented_memory_attention.py     | 33 +++++++++++++++++++
 .../speech_to_text/modules/convolution.py     | 33 +++++++++++++++++++
 .../models/speech_to_text/modules/emformer.py | 33 +++++++++++++++++++
 .../speech_to_text/multi_modality_model.py    | 33 +++++++++++++++++++
 .../models/speech_to_text/s2t_conformer.py    | 33 +++++++++++++++++++
 .../models/speech_to_text/s2t_transformer.py  | 33 +++++++++++++++++++
 .../speech_to_text/s2t_wav_transformer.py     | 33 +++++++++++++++++++
 .../fairseq/models/speech_to_text/utils.py    | 33 +++++++++++++++++++
 .../models/speech_to_text/xm_transformer.py   | 33 +++++++++++++++++++
 .../speech_to_text/xm_transformer_unity.py    | 33 +++++++++++++++++++
 .../fairseq/models/text_to_speech/__init__.py | 33 +++++++++++++++++++
 .../models/text_to_speech/codehifigan.py      | 33 +++++++++++++++++++
 .../models/text_to_speech/fastspeech2.py      | 33 +++++++++++++++++++
 .../fairseq/models/text_to_speech/hifigan.py  | 33 +++++++++++++++++++
 .../models/text_to_speech/hub_interface.py    | 33 +++++++++++++++++++
 .../models/text_to_speech/tacotron2.py        | 33 +++++++++++++++++++
 .../models/text_to_speech/tts_transformer.py  | 33 +++++++++++++++++++
 .../fairseq/models/text_to_speech/vocoder.py  | 33 +++++++++++++++++++
 .../fairseq/models/transformer/__init__.py    | 33 +++++++++++++++++++
 .../models/transformer/transformer_base.py    | 33 +++++++++++++++++++
 .../models/transformer/transformer_config.py  | 33 +++++++++++++++++++
 .../models/transformer/transformer_decoder.py | 33 +++++++++++++++++++
 .../transformer/transformer_decoder_aug.py    | 33 +++++++++++++++++++
 .../models/transformer/transformer_encoder.py | 33 +++++++++++++++++++
 .../models/transformer/transformer_legacy.py  | 33 +++++++++++++++++++
 .../fairseq/models/transformer_align.py       | 33 +++++++++++++++++++
 .../models/transformer_from_pretrained_xlm.py | 33 +++++++++++++++++++
 .../fairseq/models/transformer_lm.py          | 33 +++++++++++++++++++
 .../fairseq/models/transformer_ulm.py         | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/__init__.py        | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/utils.py           | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/wav2vec.py         | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/wav2vec2.py        | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/wav2vec2_asr.py    | 33 +++++++++++++++++++
 .../fairseq/models/wav2vec/wav2vec2_laser.py  | 33 +++++++++++++++++++
 .../fairseq/models/xmod/__init__.py           | 33 +++++++++++++++++++
 .../fairseq/models/xmod/hub_interface.py      | 33 +++++++++++++++++++
 .../fairseq/models/xmod/model.py              | 33 +++++++++++++++++++
 .../models/xmod/transformer_layer_xmod.py     | 33 +++++++++++++++++++
 .../fairseq/modules/__init__.py               | 33 +++++++++++++++++++
 .../fairseq/modules/adaptive_input.py         | 33 +++++++++++++++++++
 .../fairseq/modules/adaptive_softmax.py       | 33 +++++++++++++++++++
 .../fairseq/modules/base_layer.py             | 33 +++++++++++++++++++
 .../fairseq/modules/beamable_mm.py            | 33 +++++++++++++++++++
 .../modules/character_token_embedder.py       | 33 +++++++++++++++++++
 .../fairseq/modules/checkpoint_activations.py | 33 +++++++++++++++++++
 .../fairseq/modules/conformer_layer.py        | 33 +++++++++++++++++++
 .../fairseq/modules/conv_tbc.py               | 33 +++++++++++++++++++
 .../fairseq/modules/cross_entropy.py          | 33 +++++++++++++++++++
 .../downsampled_multihead_attention.py        | 33 +++++++++++++++++++
 .../fairseq/modules/dynamic_convolution.py    | 33 +++++++++++++++++++
 .../fairseq/modules/dynamic_crf_layer.py      | 33 +++++++++++++++++++
 .../modules/dynamicconv_layer/__init__.py     | 33 +++++++++++++++++++
 .../dynamicconv_layer/cuda_function_gen.py    | 33 +++++++++++++++++++
 .../dynamicconv_layer/dynamicconv_layer.py    | 33 +++++++++++++++++++
 .../modules/dynamicconv_layer/setup.py        | 33 +++++++++++++++++++
 .../fairseq/modules/ema_module.py             | 33 +++++++++++++++++++
 .../modules/espnet_multihead_attention.py     | 33 +++++++++++++++++++
 .../fairseq/modules/fairseq_dropout.py        | 33 +++++++++++++++++++
 .../fairseq/modules/fp32_batch_norm.py        | 33 +++++++++++++++++++
 .../fairseq/modules/fp32_group_norm.py        | 33 +++++++++++++++++++
 .../fairseq/modules/fp32_instance_norm.py     | 33 +++++++++++++++++++
 .../fairseq/modules/gelu.py                   | 33 +++++++++++++++++++
 .../fairseq/modules/grad_multiply.py          | 33 +++++++++++++++++++
 .../modules/gumbel_vector_quantizer.py        | 33 +++++++++++++++++++
 .../fairseq/modules/kmeans_attention.py       | 33 +++++++++++++++++++
 .../modules/kmeans_vector_quantizer.py        | 33 +++++++++++++++++++
 .../fairseq/modules/layer_drop.py             | 33 +++++++++++++++++++
 .../fairseq/modules/layer_norm.py             | 33 +++++++++++++++++++
 .../modules/learned_positional_embedding.py   | 33 +++++++++++++++++++
 .../modules/lightconv_layer/__init__.py       | 33 +++++++++++++++++++
 .../lightconv_layer/cuda_function_gen.py      | 33 +++++++++++++++++++
 .../lightconv_layer/lightconv_layer.py        | 33 +++++++++++++++++++
 .../fairseq/modules/lightconv_layer/setup.py  | 33 +++++++++++++++++++
 .../modules/lightweight_convolution.py        | 33 +++++++++++++++++++
 .../fairseq/modules/linearized_convolution.py | 33 +++++++++++++++++++
 .../fairseq/modules/location_attention.py     | 33 +++++++++++++++++++
 .../fairseq/modules/lstm_cell_with_zoneout.py | 33 +++++++++++++++++++
 .../fairseq/modules/multihead_attention.py    | 33 +++++++++++++++++++
 .../fairseq/modules/positional_embedding.py   | 33 +++++++++++++++++++
 .../fairseq/modules/positional_encoding.py    | 33 +++++++++++++++++++
 .../fairseq/modules/quant_noise.py            | 33 +++++++++++++++++++
 .../fairseq/modules/quantization/__init__.py  | 33 +++++++++++++++++++
 .../modules/quantization/pq/__init__.py       | 33 +++++++++++++++++++
 .../fairseq/modules/quantization/pq/em.py     | 33 +++++++++++++++++++
 .../quantization/pq/modules/__init__.py       | 33 +++++++++++++++++++
 .../modules/quantization/pq/modules/qconv.py  | 33 +++++++++++++++++++
 .../modules/quantization/pq/modules/qemb.py   | 33 +++++++++++++++++++
 .../quantization/pq/modules/qlinear.py        | 33 +++++++++++++++++++
 .../fairseq/modules/quantization/pq/pq.py     | 33 +++++++++++++++++++
 .../fairseq/modules/quantization/pq/utils.py  | 33 +++++++++++++++++++
 .../quantization/quantization_options.py      | 33 +++++++++++++++++++
 .../modules/quantization/scalar/__init__.py   | 33 +++++++++++++++++++
 .../quantization/scalar/modules/__init__.py   | 33 +++++++++++++++++++
 .../quantization/scalar/modules/qact.py       | 33 +++++++++++++++++++
 .../quantization/scalar/modules/qconv.py      | 33 +++++++++++++++++++
 .../quantization/scalar/modules/qemb.py       | 33 +++++++++++++++++++
 .../quantization/scalar/modules/qlinear.py    | 33 +++++++++++++++++++
 .../modules/quantization/scalar/ops.py        | 33 +++++++++++++++++++
 .../modules/quantization/scalar/utils.py      | 33 +++++++++++++++++++
 .../modules/rotary_positional_embedding.py    | 33 +++++++++++++++++++
 .../fairseq/modules/same_pad.py               | 33 +++++++++++++++++++
 .../fairseq/modules/scalar_bias.py            | 33 +++++++++++++++++++
 .../sinusoidal_positional_embedding.py        | 33 +++++++++++++++++++
 .../modules/sparse_multihead_attention.py     | 33 +++++++++++++++++++
 .../sparse_transformer_sentence_encoder.py    | 33 +++++++++++++++++++
 ...arse_transformer_sentence_encoder_layer.py | 33 +++++++++++++++++++
 .../fairseq/modules/transformer_layer.py      | 33 +++++++++++++++++++
 .../fairseq/modules/transformer_layer_aug.py  | 33 +++++++++++++++++++
 .../modules/transformer_sentence_encoder.py   | 33 +++++++++++++++++++
 .../transformer_sentence_encoder_layer.py     | 33 +++++++++++++++++++
 .../fairseq/modules/transpose_last.py         | 33 +++++++++++++++++++
 .../fairseq/modules/unfold.py                 | 33 +++++++++++++++++++
 .../fairseq/modules/vggblock.py               | 33 +++++++++++++++++++
 .../fairseq/nan_detector.py                   | 33 +++++++++++++++++++
 .../fairseq/ngram_repeat_block.py             | 33 +++++++++++++++++++
 .../fairseq/optim/__init__.py                 | 33 +++++++++++++++++++
 .../fairseq/optim/adadelta.py                 | 33 +++++++++++++++++++
 .../fairseq/optim/adafactor.py                | 33 +++++++++++++++++++
 .../fairseq/optim/adagrad.py                  | 33 +++++++++++++++++++
 .../fairseq/optim/adam.py                     | 33 +++++++++++++++++++
 .../fairseq/optim/adamax.py                   | 33 +++++++++++++++++++
 .../fairseq/optim/amp_optimizer.py            | 33 +++++++++++++++++++
 .../fairseq/optim/bmuf.py                     | 33 +++++++++++++++++++
 .../fairseq/optim/composite.py                | 33 +++++++++++++++++++
 .../fairseq/optim/cpu_adam.py                 | 33 +++++++++++++++++++
 .../fairseq/optim/dynamic_loss_scaler.py      | 33 +++++++++++++++++++
 .../fairseq/optim/fairseq_optimizer.py        | 33 +++++++++++++++++++
 .../fairseq/optim/fp16_optimizer.py           | 33 +++++++++++++++++++
 .../fairseq/optim/fused_adam.py               | 33 +++++++++++++++++++
 .../fairseq/optim/fused_lamb.py               | 33 +++++++++++++++++++
 .../fairseq/optim/lr_scheduler/__init__.py    | 33 +++++++++++++++++++
 .../optim/lr_scheduler/cosine_lr_scheduler.py | 33 +++++++++++++++++++
 .../lr_scheduler/fairseq_lr_scheduler.py      | 33 +++++++++++++++++++
 .../optim/lr_scheduler/fixed_schedule.py      | 33 +++++++++++++++++++
 .../inverse_square_root_schedule.py           | 33 +++++++++++++++++++
 .../optim/lr_scheduler/manual_lr_scheduler.py | 33 +++++++++++++++++++
 .../optim/lr_scheduler/pass_through.py        | 33 +++++++++++++++++++
 .../lr_scheduler/polynomial_decay_schedule.py | 33 +++++++++++++++++++
 .../lr_scheduler/reduce_lr_on_plateau.py      | 33 +++++++++++++++++++
 .../optim/lr_scheduler/step_lr_scheduler.py   | 33 +++++++++++++++++++
 .../lr_scheduler/tri_stage_lr_scheduler.py    | 33 +++++++++++++++++++
 .../lr_scheduler/triangular_lr_scheduler.py   | 33 +++++++++++++++++++
 .../fairseq/optim/nag.py                      | 33 +++++++++++++++++++
 .../fairseq/optim/sgd.py                      | 33 +++++++++++++++++++
 .../fairseq/optim/shard.py                    | 33 +++++++++++++++++++
 .../fairseq/options.py                        | 33 +++++++++++++++++++
 .../fairseq/pdb.py                            | 33 +++++++++++++++++++
 .../fairseq/quantization_utils.py             | 33 +++++++++++++++++++
 .../fairseq/registry.py                       | 33 +++++++++++++++++++
 .../fairseq/scoring/__init__.py               | 33 +++++++++++++++++++
 .../fairseq/scoring/bertscore.py              | 33 +++++++++++++++++++
 .../fairseq/scoring/bleu.py                   | 33 +++++++++++++++++++
 .../fairseq/scoring/chrf.py                   | 33 +++++++++++++++++++
 .../fairseq/scoring/meteor.py                 | 33 +++++++++++++++++++
 .../fairseq/scoring/tokenizer.py              | 33 +++++++++++++++++++
 .../fairseq/scoring/wer.py                    | 33 +++++++++++++++++++
 .../fairseq/search.py                         | 33 +++++++++++++++++++
 .../fairseq/sequence_generator.py             | 33 +++++++++++++++++++
 .../fairseq/sequence_scorer.py                | 33 +++++++++++++++++++
 .../fairseq/speech_generator.py               | 33 +++++++++++++++++++
 .../fairseq/tasks/__init__.py                 | 33 +++++++++++++++++++
 .../fairseq/tasks/audio_finetuning.py         | 33 +++++++++++++++++++
 .../fairseq/tasks/audio_pretraining.py        | 33 +++++++++++++++++++
 .../fairseq/tasks/cross_lingual_lm.py         | 33 +++++++++++++++++++
 .../fairseq/tasks/denoising.py                | 33 +++++++++++++++++++
 .../fairseq/tasks/fairseq_task.py             | 33 +++++++++++++++++++
 .../fairseq/tasks/frm_text_to_speech.py       | 33 +++++++++++++++++++
 .../fairseq/tasks/hubert_pretraining.py       | 33 +++++++++++++++++++
 .../fairseq/tasks/language_modeling.py        | 33 +++++++++++++++++++
 .../fairseq/tasks/legacy_masked_lm.py         | 33 +++++++++++++++++++
 .../fairseq/tasks/masked_lm.py                | 33 +++++++++++++++++++
 .../fairseq/tasks/multilingual_denoising.py   | 33 +++++++++++++++++++
 .../tasks/multilingual_language_modeling.py   | 33 +++++++++++++++++++
 .../fairseq/tasks/multilingual_masked_lm.py   | 33 +++++++++++++++++++
 .../fairseq/tasks/multilingual_translation.py | 33 +++++++++++++++++++
 .../fairseq/tasks/nlu_finetuning.py           | 33 +++++++++++++++++++
 .../fairseq/tasks/online_backtranslation.py   | 33 +++++++++++++++++++
 .../tasks/semisupervised_translation.py       | 33 +++++++++++++++++++
 .../fairseq/tasks/sentence_prediction.py      | 33 +++++++++++++++++++
 .../tasks/sentence_prediction_adapters.py     | 33 +++++++++++++++++++
 .../fairseq/tasks/sentence_ranking.py         | 33 +++++++++++++++++++
 .../fairseq/tasks/simultaneous_translation.py | 33 +++++++++++++++++++
 .../fairseq/tasks/span_masked_lm.py           | 33 +++++++++++++++++++
 .../fairseq/tasks/speech_dlm_task.py          | 33 +++++++++++++++++++
 .../fairseq/tasks/speech_to_speech.py         | 33 +++++++++++++++++++
 .../fairseq/tasks/speech_to_text.py           | 33 +++++++++++++++++++
 .../fairseq/tasks/speech_ulm_task.py          | 33 +++++++++++++++++++
 .../fairseq/tasks/text_to_speech.py           | 33 +++++++++++++++++++
 .../fairseq/tasks/translation.py              | 33 +++++++++++++++++++
 .../tasks/translation_from_pretrained_bart.py | 33 +++++++++++++++++++
 .../tasks/translation_from_pretrained_xlm.py  | 33 +++++++++++++++++++
 .../fairseq/tasks/translation_lev.py          | 33 +++++++++++++++++++
 .../tasks/translation_multi_simple_epoch.py   | 33 +++++++++++++++++++
 .../fairseq/token_generation_constraints.py   | 33 +++++++++++++++++++
 .../fairseq/tokenizer.py                      | 33 +++++++++++++++++++
 .../fairseq/trainer.py                        | 33 +++++++++++++++++++
 .../fairseq/utils.py                          | 33 +++++++++++++++++++
 .../fairseq_cli/__init__.py                   | 33 +++++++++++++++++++
 .../fairseq_cli/eval_lm.py                    | 33 +++++++++++++++++++
 .../fairseq_cli/generate.py                   | 33 +++++++++++++++++++
 .../fairseq_cli/hydra_train.py                | 33 +++++++++++++++++++
 .../fairseq_cli/hydra_validate.py             | 33 +++++++++++++++++++
 .../fairseq_cli/interactive.py                | 33 +++++++++++++++++++
 .../fairseq_cli/preprocess.py                 | 33 +++++++++++++++++++
 .../fairseq_cli/score.py                      | 33 +++++++++++++++++++
 .../fairseq_cli/train.py                      | 33 +++++++++++++++++++
 .../fairseq_cli/validate.py                   | 33 +++++++++++++++++++
 .../hubconf.py                                | 33 +++++++++++++++++++
 .../release_utils.py                          | 33 +++++++++++++++++++
 .../scripts/__init__.py                       | 33 +++++++++++++++++++
 .../scripts/average_checkpoints.py            | 33 +++++++++++++++++++
 .../scripts/build_sym_alignment.py            | 33 +++++++++++++++++++
 .../scripts/check_installation.py             | 33 +++++++++++++++++++
 .../scripts/compare_namespaces.py             | 33 +++++++++++++++++++
 .../scripts/constraints/extract.py            | 33 +++++++++++++++++++
 .../scripts/constraints/validate.py           | 33 +++++++++++++++++++
 .../scripts/count_docs.py                     | 33 +++++++++++++++++++
 .../scripts/read_binarized.py                 | 33 +++++++++++++++++++
 .../scripts/rm_pt.py                          | 33 +++++++++++++++++++
 .../scripts/shard_docs.py                     | 33 +++++++++++++++++++
 .../scripts/split_train_valid_docs.py         | 33 +++++++++++++++++++
 .../scripts/spm_decode.py                     | 33 +++++++++++++++++++
 .../scripts/spm_encode.py                     | 33 +++++++++++++++++++
 .../scripts/spm_train.py                      | 33 +++++++++++++++++++
 .../setup.py                                  | 33 +++++++++++++++++++
 .../train.py                                  | 33 +++++++++++++++++++
 470 files changed, 15510 insertions(+)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
index ee4fd20304..a78bc13e4d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
index 30d5d51bbd..d47523ced7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 # import models/tasks to register them
 from . import dummy_dataset, dummy_lm, dummy_masked_lm, dummy_model, dummy_mt  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
index 6553ba4aed..f4b98ac71d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/benchmark_multihead_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import random
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
index 2f051754af..a826778531 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import numpy as np
 from fairseq.data import FairseqDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
index 2f2336b2ce..bac8de0abc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
index 7571a86811..a8c903a189 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
index 0b2325aa16..a5945b87d7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
index d77e1deeda..fa19076077 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/benchmark/dummy_mt.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
index c148402cc0..de8d45b1f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/binarizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
index 4f492ce143..6755f6bccf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/checkpoint_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import ast
 import collections
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/config/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
index 1f4baec525..3460f863e0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
index 50d4c997a6..ffcb1899d4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/adaptive_loss.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
index be8a1c89c8..1e2fe77a0f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/composite_loss.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq import utils
 from fairseq.criterions import LegacyFairseqCriterion, register_criterion
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
index bbb0424aed..c610f9e719 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/cross_entropy.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
index 368213cb2b..e422a28c86 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/ctc.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # All rights reserved.
 #
 # This source code is licensed under the license found in the LICENSE file in
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
index eed86feafa..1ed44e0990 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fairseq_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import inspect
 from typing import Any, Dict, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
index ab7cd08e3b..8df0f8dec0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/fastspeech2_loss.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
index b99f9c8e01..1c026faa58 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/hubert_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
index a5477e0975..d847552578 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
index 459d435bf8..b6f3ee5b8b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
index df1e1953a2..a5df94bca6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
index 92935d9c80..1bddf57172 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
index 819250d5f3..a922dc7020 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
index 8e54d0feb3..a7dd27231d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/legacy_masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
index fedba8eaed..bebf915aed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
index ebdf81f643..7e8d8b7190 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/model_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
index 3b1c38ee65..f706356ec7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/nat_loss.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
index 837439a646..da3cfcfc32 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
index 9d2c65e6fb..7d7fe4b464 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_prediction_adapters.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
index 8b0898422c..c793d8e294 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/sentence_ranking.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
index 6c7bc0e250..3d207e43fc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_dlm_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
index a9438f6b6c..95ee9b036c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_to_speech_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
index 8a9ef6ac27..cefa5b6210 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/speech_ulm_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
index 4113fdc548..9f258ef27f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/tacotron2_loss.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
index 794a39184b..87f96ebbc9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/criterions/wav2vec_criterion.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
index 98726903ad..0a75cc0001 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 from .dictionary import Dictionary, TruncatedDictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
index 4f9c9972e5..e23ef91875 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_class_target_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
index 60419466b1..8a11c43c68 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/add_target_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
index 026e4811ce..d118735fb4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/append_token_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
index dff90fadfc..a665e214e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from abc import ABC, abstractmethod
 from typing import Dict, Optional
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
index 121cca60c5..a4332679df 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/audio_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import mmap
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
index 34595cc1bc..3718fd2553 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/data_cfg.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
index b24c6f731f..4ca2add9fd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import os
 from fairseq.data.audio import (
     AudioTransform,
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
index 0b632ccf2b..b9b02085c3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/concataugment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from typing import List
 import numpy as np
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
index e9ebec2388..c9ad406350 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import numpy as np
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
index d295013b90..22a5e696df 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import os
 from fairseq.data.audio import (
     AudioTransform,
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
index 49d090b11e..37af92ec3c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/delta_deltas.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import numpy as np
 import torch
 from fairseq.data.audio.feature_transforms import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
index e457ff176f..0c95193db2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/global_cmvn.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import numpy as np
 from fairseq.data.audio.feature_transforms import (
     AudioFeatureTransform,
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
index ce5802b41a..482acbd111 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/specaugment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import math
 import numbers
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
index 37637bc09a..07b59672e1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import numpy as np
 
 from fairseq.data.audio.feature_transforms import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
index b54654d492..add800ed7a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/frm_text_to_speech_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
index c3c4a4eae5..5de536d539 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/hubert_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
index 0a42c10611..748da1b359 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/multi_modality_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2021-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
index 52f8cfd7fd..6d7317e559 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/raw_audio_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
index d9bbb76ec8..7df2a1a1ca 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_speech_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
index d2a71d16fa..1269469aa6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import csv
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
index 7b8145ac48..aab41f8067 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/speech_to_text_joint_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
index 13612b458b..abee9b8217 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/text_to_speech_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
index 57f8bb571b..fe502fef5f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import os
 from fairseq.data.audio import (
     AudioTransform,
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
index 401ce30943..2362532b9b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/audio/waveform_transforms/noiseaugment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from pathlib import Path
 import numpy as np
 from math import ceil
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
index dc386c9b4a..057562305c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/backtranslation_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
index 414ba0b6c9..274b6f2c09 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/base_wrapper_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from torch.utils.data.dataloader import default_collate
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
index 014fc284a6..87e8a494d5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/bucket_pad_length_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
index 77618c5796..1f6b4780e7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/codedataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
index e88ce9bada..698d9f671f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/colorize_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
index 105506864c..df61472c8e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import bisect
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
index 2bb37b018e..c6a5df5ec1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/concat_sentences_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
index d156d7b455..8eac21d046 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 try:
     from collections.abc import Iterable
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
index 1c83436d46..eadb1f68a0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/denoising_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
index e6101dbd17..1781651a3a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/dictionary.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import os
 from collections import Counter
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
index 837c1ba299..4c730db220 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
index 4b7c80ee5a..56dff0fe1a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
index 1ea23b0db9..23ff3fa850 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/byte_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
index 2ce946895f..35d178e413 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/bytes.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from fairseq.data.encoders import register_bpe
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
index 77a1a7885d..4943eb3740 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/characters.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from fairseq.data.encoders import register_bpe
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
index 36bf93e423..2f6927a70e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/fastbpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
index 10327002f7..235192edf5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
index 996d3d4a11..04f67ff414 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/gpt2_bpe_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Byte pair encoding utilities from GPT-2.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
index 2e7c7bba84..0779d748e5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_bert_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
index eaa46b9373..e57015c800 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/hf_byte_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
index 7df4654fcd..55c0c1577b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/moses_tokenizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
index 44ad4e3677..77d78c3a1c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/nltk_tokenizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.data.encoders import register_tokenizer
 from fairseq.dataclass import FairseqDataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
index 48d310ac9d..ef234e9199 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/sentencepiece_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
index 24bc3cea75..fe2c5b0c66 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/space_tokenizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
index 7b04ebe537..4073267767 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/subword_nmt_bpe.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
index d108773aeb..740b570d5c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/encoders/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq.data import encoders
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
index 3b731c622a..89fdf813b3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fairseq_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import numpy as np
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
index d07b42c7e2..8a7ca4778e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/fasta_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import os
 import subprocess
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
index 61b339e1c2..16dd8114aa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .huffman_coder import HuffmanCodeBuilder, HuffmanCoder
 from .huffman_mmap_indexed_dataset import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
index 9a4989e543..3bf25f5cd0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_coder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import re
 import typing as tp
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
index 7d1a58912c..8a573fa8c2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/huffman/huffman_mmap_indexed_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import mmap
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
index 85387c6ed2..64745fdc33 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/id_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
index d938888b5d..4f01650847 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/indexed_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import shutil
 import struct
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
index f7f8ae1b99..263bd1d335 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/iterators.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
index cae8ccd16c..cf4c86cc84 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/language_pair_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
index 19d47fb207..4ca6844475 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .block_pair_dataset import BlockPairDataset
 from .masked_lm_dataset import MaskedLMDataset
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
index 221311f4c1..c5f5cf324e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/block_pair_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
index f4c895eebd..7f5842424d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Dict, List, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
index 635ce3866b..1ac12c8d23 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/legacy/masked_lm_dictionary.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.data import Dictionary
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
index 0f3db5b4d7..8b3e9ef22a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/list_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
index 190b89e320..1df52d831d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lm_context_window_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
index d187afa1ca..47953c0698 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/lru_cache_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from functools import lru_cache
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
index 3852277b72..cbaed7e12e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/mask_tokens_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from functools import lru_cache
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
index 3df617bc5e..4b8f711e3e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/monolingual_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
index af1830a17c..5313d2a063 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import asyncio
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
index d601881c82..c26e379a34 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multi_corpus_sampled_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import OrderedDict
 from typing import Callable, Dict, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
index 75f041b419..af452eb126 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_data_manager.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
index b4e0f9828c..2862dab0a1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/multilingual_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from enum import Enum
 from typing import Dict, List, Optional, Sequence
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
index e378ca578f..ef58cf8c5b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import datetime
 import hashlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
index 33538ae7c3..6c46daba01 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import hashlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
index 7fcaddf751..fd5e2072a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/multilingual/sampling_method.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
index 97a919878b..547495628f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/nested_dictionary_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
index 5ed11cf83f..b21ada3463 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/noising.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
index da6d29fa2c..bfe5ea1ca1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/num_samples_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import FairseqDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
index 35b27d9261..4933dbc985 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/numel_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
index 559e98d998..0085fd770f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/offset_tokens_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
index 705033977d..bda4925ee3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/pad_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.data import data_utils
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
index e4aaa80743..05ba02bb10 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/padding_mask_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
index ba492e2a18..3250b5eda3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/plasma_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import hashlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
index efe919fee9..ed5a5dbf6e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
index 587edd6726..44bd1f6d23 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/prepend_token_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
index 58632ee3e3..2308651239 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/raw_label_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
index 1b33cc29d2..55716d95a9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/replace_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
index bc1056503e..f5c7a06779 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/resampling_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
index 46604a0351..c734e4aaa8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/roll_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
index 845d46b5fd..0e3eb9be2e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/round_robin_zip_datasets.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from collections import OrderedDict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
index f0e8dc2798..908d9a262d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/shorten_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 from fairseq.data import data_utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
index 3000cadefd..a643d2ce0b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/sort_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
index 4b9fc93d0b..383833d4dd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/span_mask_tokens_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
index e1816066d8..8b39b6c026 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/speech_dlm_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
index 88457b846b..9ae8d4fbb8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/strip_token_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import BaseWrapperDataset
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
index edb3a1bfc5..e8c8847e49 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/subsample_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
index e48c802941..2fdfc0eab5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/text_compressor.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from enum import Enum
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
index 7a92af4267..d90df417c3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
index aa1f4bc3ef..9ed9dad435 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_concat_langpair_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
index a0acc0f9b5..bf8631bdcb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
index 561696dcc3..e118a379e4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/transform_eos_lang_pair_dataset.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
index 370c944db4..1d8b6bf68b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .configs import FairseqDataclass
 from .constants import ChoiceEnum
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
index 8e4fbfa12e..212edfafe3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/configs.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import os
 import sys
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
index d7844a36f9..f6e51737e9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/constants.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from enum import Enum, EnumMeta
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
index 646225ae42..a07221da06 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/initialize.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
index 7c8ce075a4..efe3f54290 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/dataclass/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import ast
 import inspect
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
index b51cb7ab5b..4846fdb25f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .distributed_timeout_wrapper import DistributedTimeoutWrapper
 from .fully_sharded_data_parallel import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
index dc84d60db7..90bce28456 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/distributed_timeout_wrapper.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
index 731e740f77..a148a9460e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/fully_sharded_data_parallel.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
index 715cdf5334..e1da49baea 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/legacy_distributed_data_parallel.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 A modified version of the legacy DistributedDataParallel module that uses c10d
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
index 950c1e6469..0b2965a4f0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/module_proxy_wrapper.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from torch import nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
index 2d39ca583b..55039c1919 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/tpu_distributed_data_parallel.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
index 9afe8d1ece..3d7f6be4d1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/distributed/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import io
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
index c83f42f0f5..f555094b13 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_chunker_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import os
 import typing as tp
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
index 468c281c82..39d3372d06 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_io.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
index 9204fbd02c..2f59962451 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/file_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 Utilities for working with the local dataset cache.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
index bb9b730a77..f054cad4a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/hub_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
index f1c2b0cce4..6b34f1658b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/incremental_decoding_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import uuid
 from typing import Dict, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
index 99edc43447..9738e1f947 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/iterative_refinement_generator.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import namedtuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
index 41372dcc0d..1d13dbd3ee 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/meters.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import bisect
 import time
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
index d16e13b4bf..d20344fa0d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/metrics.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 A standalone module for aggregating metrics.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
index b9d2916ea8..25609ec66c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/logging/progress_bar.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 Wrapper around various loggers and progress bars (e.g., tqdm).
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
index 4811d61956..30142192dc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import criterions, models, modules  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
index 8cd368c35d..1ae41bec25 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
index ef15075284..7805e60706 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
index b7c6acb6f6..f69bfc2f24 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/megatron_trainer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 Train a network across multiple GPUs.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
index 99de782fac..200f797a16 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
index 71785a3745..9798eb26e5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
index 47f3accd60..a3f7ee5fd4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from collections import namedtuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
index 29d1ed0f13..5c633e2006 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
index 942c45eaac..3228232634 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
index 2fb56d312e..3c70c5f20d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/models/transformer_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
index 27839ae72d..2b971dbbc2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 from .multihead_attention import ModelParallelMultiheadAttention
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
index c71156071f..c34c92312d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/multihead_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
index 62e8a8581f..1a657a79ba 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/model_parallel/modules/transformer_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
 from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
index 06e22aa403..b58949f7ed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
index 1c9ce80f68..e67630b09f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .hub_interface import *  # noqa
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
index 4c09efbaca..026c5a7052 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
index 20635b5c6e..eee7611057 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/bart/model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 BART: Denoising Sequence-to-Sequence Pre-training for
 Natural Language Generation, Translation, and Comprehension
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
index 6354049675..79449cdffb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/composite_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .fairseq_encoder import FairseqEncoder
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
index 921656ac27..70b55cc724 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/distributed_fairseq_model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
index 9257a1a3c1..8b4f16e0ad 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
index 472d5d5f17..92f3961fa2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/ema/ema.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
index 4328ca8689..fca7d8ae17 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_decoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
index e1e3d0c948..3f3ab57b61 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, NamedTuple, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
index bac6d8a5a2..a383629e60 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_incremental_decoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from typing import Dict, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
index 3682e633f8..c0b963b953 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fairseq_model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Base classes for various fairseq models.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
index a0b97cfc20..9e89e9a86a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
index cb2db7a6de..7bf4ccff44 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
index 59be5a9bb5..9749108d89 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/fconv_self_att.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
index 7f04042e5f..198aecbf58 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .hubert import *  # noqa
 from .hubert_asr import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
index 757a897eff..a64193acf3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
index 4328c11fd6..196d0593d3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/hubert/hubert_asr.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
index 7416ecaba3..cd3b4ed584 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import importlib
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
index 25c47c811b..d642271dee 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/huggingface/hf_gpt2.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
index fd26bac53c..29d7969af2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Any, Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
index 0f27805e09..b80d127c40 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lightconv_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
index 82d7a55efb..3c31f8a3aa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
index 5dbfa472fd..205a5d3ed7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/lstm_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq import utils
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
index a357ed8245..e4f0da15f2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
index 6d2807a26e..3ab3d303c0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/model_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
index ce03b68e71..4a90961938 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/multilingual_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import OrderedDict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
index e7788467c5..59847f4bd5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 from .fairseq_nat_model import *
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
index 784a834260..a700fc2b44 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/cmlm_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 This file implements:
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
index b03a77f952..02372227a2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/fairseq_nat_model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
index f8c83e1b89..b0c64365ec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/insertion_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
index 037c4d44d8..9565e3e441 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq.models import register_model, register_model_architecture
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
index aee7669fcb..989f4f264e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
index 660e3dc6a1..7b028aa349 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/levenshtein_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq.utils import new_arange
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
index c255baebb0..0a40af7638 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nat_crf_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from fairseq.models import register_model, register_model_architecture
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
index fa1510e7d1..3c39cbb5b2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_ensembles.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
index c0b519c463..dcea0707c7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/nat/nonautoregressive_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
index 7ce4f3cfe8..b6aa5cb1d0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .hub_interface import *  # noqa
 from .model import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
index 8b3d56f652..762ba40197 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/alignment_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import Counter
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
index e538dee0aa..8ccf714a1b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/enc_dec.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import argparse
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
index 618ef5247c..7b199e4454 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
index 9284634168..26bd2dad91 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 RoBERTa: A Robustly Optimized BERT Pretraining Approach.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
index be645eeca6..665404bbc8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_camembert.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 CamemBERT: a Tasty French Language Model
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
index a5c6bbd09a..b1dcd0f7de 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_gottbert.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 GottBERT: a pure German Language Model
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
index aa6a86d084..91022c9bed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/roberta/model_xlmr.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Unsupervised Cross-lingual Representation Learning at Scale
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
index 6613b1dbe0..db9f762972 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .speech_dlm import *  # noqa
 from .hub_interface import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
index 76a449680a..eeac150776 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
index 9c43a7c56c..6396380e21 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Any, Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
index 8442bc6e78..5879ff539b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Tuple, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
index 0f03a0c9d0..baf166a3ec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .multichannel_sequence_generator import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
index edbe182c2f..c995a0f7e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
index 94c2d83b5c..65cfde5ebe 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
index 7a3c8186fb..797161db75 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_dlm/speech_dlm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
index f302e4b658..799b23750e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .s2s_conformer import *  # noqa
 from .s2s_conformer_translatotron2 import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
index 057a104e90..9b32e7cf77 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/ctc_decoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from torch import nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
index b7cfabf371..a91cf89f81 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/stacked_embedding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
index 923d79f1df..c4de1eda7a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Any, Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
index 79c11f6b1d..1f5f0f4b76 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/modules/transformer_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
index 47eb65a74a..ca8d1ebda8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
index 42c320bdea..439a844a2a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
index bf904ff106..51d64a7e1e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_conformer_unity.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
index 534c1df73b..d3780afbed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_speech/s2s_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
index 0e6b7da038..fd5871c5bf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .berard import *  # noqa
 from .convtransformer import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
index 107ac983c6..8e6fa06230 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/berard.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 from ast import literal_eval
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
index e5945bcad3..aa05468205 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/convtransformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
index c4d683b2cf..e083eb0ce9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
index ec95a6d0a4..869c7c087c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/augmented_memory_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import List, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
index 0739a19d18..045878370d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/convolution.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
index 935d593078..2766185396 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/modules/emformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
index e40f818571..51abef6f27 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/multi_modality_model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.models import FairseqDecoder, FairseqEncoder
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
index 4b57ed110f..de0ad64078 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_conformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
index 6d16d47d08..108c9e5955 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
index ad21aeeb1a..1b15588917 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/s2t_wav_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
index 33117446a5..1360f35bd1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
index ce780faca5..83202e92af 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
index 0f05b5ee80..4fc70a189f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/speech_to_text/xm_transformer_unity.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import copy
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
index ad3cf31d50..46aae1211b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .tacotron2 import *  # noqa
 from .tts_transformer import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
index d1574dd63f..1303819f0b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/codehifigan.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from argparse import Namespace
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
index c3117829c3..4ecdeee034 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/fastspeech2.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
index a852beefec..8c2146486a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hifigan.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
index d5f3d117e0..1b253cfa8a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import random
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
index 9f0876c20c..06af9e5ebd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tacotron2.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
index d7541068f4..3c65cd86f1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/tts_transformer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from typing import List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
index 173e1a9415..60b0a87baf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/text_to_speech/vocoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import json
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
index 681fca3d45..0259a2fdc4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) Facebook Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
index a10c8bd0a9..b03816e328 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_base.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
index 1fef3e6ebc..09b6dd51a8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_config.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
index 97e0c44dc6..195d7c4b32 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Any, Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
index 15816a0e4f..db2a9462c6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_decoder_aug.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Any, Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
index c738a6ddea..a47ce24126 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Dict, List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
index 955fb77abd..8f69eeb93b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer/transformer_legacy.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.dataclass.utils import gen_parser_from_dataclass
 from fairseq.models import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
index e93aa2e46c..90f07ae731 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_align.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.models import register_model, register_model_architecture
 from fairseq.models.transformer import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
index 6a679ebccd..4415728f9e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_from_pretrained_xlm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import os
 from typing import Any, Dict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
index e0ae91d8f4..af3b457c34 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
index f4536bce80..ce04d392b6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/transformer_ulm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
index 315081a63c..ce58938eab 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .wav2vec import *  # noqa
 from .wav2vec2 import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
index b2f3730ff2..c1e359f3e9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
index 65e3f65ba7..af3cfbbf05 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
index 8acbd75f8b..d9db04530e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
index aeda70e6f9..dfa85e8062 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_asr.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 import copy
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
index 95febcdaec..035b3e1cdc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/wav2vec/wav2vec2_laser.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.models import BaseFairseqModel, register_model
 from fairseq.models.wav2vec.wav2vec2_asr import (
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
index c36afbb194..5caa98575e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .model import *  # noqa
 from .transformer_layer_xmod import *  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
index ca460f8ee9..9759e199ef 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/hub_interface.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from fairseq.models.roberta.hub_interface import RobertaHubInterface
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
index 585af7e3ec..007d4a2139 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/model.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from ..roberta.model_xlmr import XLMRModel
 from fairseq.models.xmod.transformer_layer_xmod import XMODTransformerEncoderLayerBase
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
index bdacdaf1c2..f30a661475 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/models/xmod/transformer_layer_xmod.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.modules.transformer_layer import TransformerEncoderLayer
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
index a784b18d2d..8eb4eeb2c1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 from .adaptive_input import AdaptiveInput
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
index fad619c529..1e4ef1c6c8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_input.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
index 02fedf8632..af879901da 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/adaptive_softmax.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import functools
 import operator
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
index fb70a72013..8b06b46f26 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/base_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
index 73adf6497e..653d653b5a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/beamable_mm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
index dc686c8cf5..c99690c218 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/character_token_embedder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from typing import List, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
index ec48a0dbff..3eaf27a835 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/checkpoint_activations.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import functools
 from typing import Any, Dict, List, Tuple, Union
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
index 3cf75bad3f..fd2836090c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conformer_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from typing import Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
index bafcc41fd1..677a3754cc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/conv_tbc.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
index 508a0820e8..a7ef5e9c1d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/cross_entropy.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
index 5a478e9a21..f3bc9dbc25 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/downsampled_multihead_attention.py
@@ -1,4 +1,37 @@
 #
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
index 1034f1f546..03dd1a72b3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_convolution.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
index d6d7cd5b8e..b1726ce610 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamic_crf_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 This file is to re-implemented the low-rank and beam approximation of CRF layer
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
index 57bfe55a90..669fe70839 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .dynamicconv_layer import DynamicconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
index 387f8e1266..d4f2eb690c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 def gen_forward():
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
index 141cfc18b6..a2371a6ec8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import dynamicconv_cuda
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
index ffb2d071de..d3cfc7deb1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/setup.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 from setuptools import setup
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
index f0ece842d4..9a464d5e72 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/ema_module.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
index 82bc0d7b45..cc138e57c7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/espnet_multihead_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
index b5c5a698fa..5544dde351 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fairseq_dropout.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from typing import List, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
index 7c34652947..10b9788e2e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_batch_norm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 batch norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
index 0f2306ab3e..faef57c084 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_group_norm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Layer norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
index cbbdb9f338..708bb11413 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/fp32_instance_norm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Layer norm done in fp32 (for fp16 training)
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
index 9d2ef16c68..8d4e60d634 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gelu.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with
 the corresponding GitHub repo: https://github.com/hendrycks/GELUs
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
index 592a9cede7..09eb0a71e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/grad_multiply.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
index 8869cb2339..572f8b39ec 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/gumbel_vector_quantizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
index 0088d1ebdc..a0535f79f6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import math
 from functools import reduce, wraps
 from inspect import isfunction
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
index 5e59f31648..99f6283543 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/kmeans_vector_quantizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
index b25b3f494d..26ba93215e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_drop.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 LayerDrop as described in https://arxiv.org/abs/1909.11556.
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
index 1e9d63d562..fd0bbae15c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/layer_norm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
index c96eca6402..fea7755301 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/learned_positional_embedding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
index b207235666..a437053e67 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .lightconv_layer import LightconvLayer  # noqa
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
index 3e3d2d9383..abe8e13eef 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/cuda_function_gen.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 def gen_forward():
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
index 202ba55b48..b3bab11545 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/lightconv_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import lightconv_cuda
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
index b236a12935..ba7662fc47 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightconv_layer/setup.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 from setuptools import setup
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
index 4ce73e7dd5..29339e4b74 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lightweight_convolution.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
index ef69db2afc..f056d0e6e3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/linearized_convolution.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
index 1b59c68a50..f24019c093 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/location_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
index 53220737f3..e3d21d6080 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/lstm_cell_with_zoneout.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
index e6c99cf35d..5f3844c662 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/multihead_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Dict, List, Optional, Tuple
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
index f12cb9eef3..51b44afb30 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_embedding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
index 7d3a30fa93..57790b1134 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/positional_encoding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
index c5ce383931..e822f5a974 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quant_noise.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
index 90e4f2beae..34b252bcdd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .utils import SizeTracker, get_param, attrsetter, quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
index 190db6c933..c38b2c750a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/em.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
index 44f8121900..8d0fbe72d4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .qconv import PQConv2d  # NOQA
 from .qemb import PQEmbedding  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
index 3687f2c7d4..f6358615ff 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qconv.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
index 91ac168673..d9649aff40 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qemb.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
index 5b4753d4f8..18ac3bef86 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/modules/qlinear.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
index ef7b237187..dbe6cfbb7e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/pq.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .em import EM, EmptyClusterResolveError
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
index 7983cbf4c0..2582301287 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/pq/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import re
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
index 7b5effef20..473856c8f2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/quantization_options.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 def parse_config_yaml(yaml_data):
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
index e66f535e9c..f7b29973e6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/__init__.py
@@ -1,2 +1,35 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .utils import quantize_model_  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
index bea430328a..e15cbdd660 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from .qact import ActivationQuantizer  # NOQA
 from .qconv import IntConv2d  # NOQA
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
index 38aa5a9677..507d6e6dfc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qact.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
index 7ae37cc551..d7531021cd 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qconv.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn.functional as F
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
index bc63747729..abf3d452da 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qemb.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
index 599aa61639..5a6e5347bc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/modules/qlinear.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.nn as nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
index 7bfae4bf36..a44abc37ab 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/ops.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
index 2c0c38f0a6..1823e7bd70 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/quantization/scalar/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from operator import attrgetter
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
index 84b88984ea..359095bc40 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/rotary_positional_embedding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import torch
 
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
index 8a9eeb69fe..6c938364f5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/same_pad.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from torch import nn
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
index ee350ff283..1b742b4bd8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/scalar_bias.py
@@ -1,4 +1,37 @@
 #
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
+#
 
 import torch
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
index f072e0e875..59a0c24519 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sinusoidal_positional_embedding.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from typing import Any, Optional
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
index da4175f01c..bbd303e9f3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_multihead_attention.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
index c9b7cdf192..aab6ab41b8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn as nn
 from fairseq.modules import TransformerSentenceEncoder
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
index b3a4cfb6e4..bf314cefd3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.modules import TransformerSentenceEncoderLayer
 from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
index ccb20da6e5..9ab423bc97 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
index 87b0752644..cf244e4d7e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_layer_aug.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Dict, List, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
index cb2ae64b9a..2609ca4200 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Optional, Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
index 10a4a58a20..9dc5d8e333 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Callable, Optional
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
index 1f0380fbd2..5d2c69d756 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/transpose_last.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 transpose last 2 dimensions of the input
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
index 67dab04396..13eb73338e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/unfold.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.nn.functional as F
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
index f10d9b20d7..5afe4ea69b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/vggblock.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
index 28b2129f74..e47806bf66 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/nan_detector.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
index 4eb5030311..eb2db02be4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/ngram_repeat_block.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Originally from Microsoft Corporation.
 # Licensed under the MIT License.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
index 9bfd3dabb8..0870219ac2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
index bb33c73656..34091fb771 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adadelta.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
index b6f0f494bd..bf647967ae 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adafactor.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
index 0f609b798c..e6985225df 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adagrad.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
index d3f711c938..14bcbd797d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
index f8420c24cd..fdb345b392 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adamax.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 import torch.optim
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
index d2c274a957..e4def1afa3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/amp_optimizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
index 2187e613a6..0516c1a9db 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/bmuf.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
index fb8e16044a..b767ef5b51 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/composite.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from collections import defaultdict
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
index dfb70c94e3..ba5445820a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/cpu_adam.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import importlib
 from collections.abc import Collection
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
index 81bee4e264..6c200ba3f0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/dynamic_loss_scaler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 class DynamicLossScaler(object):
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
index 11e2e8a4b6..1cc841406d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
index db16fc4b47..cc5f8f53e6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fp16_optimizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections import defaultdict
 from itertools import chain
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
index 4ac74811a2..eb3a55e631 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_adam.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import types
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
index 7fae6939d3..17c6d8cc39 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fused_lamb.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from fairseq.optim import LegacyFairseqOptimizer, register_optimizer
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
index 089a251457..b3220be15e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
index 73bbb9ed67..31fc856f48 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from collections.abc import Collection
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
index c0571d8618..2bdf79547c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from argparse import Namespace
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
index da801ae6cd..5e993c96aa 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 from typing import Optional, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
index cf987f656c..ee3ca3e339 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
index 8bdfc64aad..81f8a25157 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/manual_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from . import LegacyFairseqLRScheduler, register_lr_scheduler
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
index 49053811e5..eea7826b41 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/pass_through.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
index 83134915dd..54354a17a9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 from typing import Optional, List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
index 2c92d189ef..76ab09cf1b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 from typing import List
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
index 7ca71d270d..df7ecb8746 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/step_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
index 0c9f77a231..34404eaff1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
index 167daebcb9..4583e88e83 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
index d28b4b8c28..f918024a2c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/nag.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from collections.abc import Collection
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
index 310d9e35a9..1466b0c8f0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/sgd.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch.optim
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
index 23b3ef5e06..be206fa235 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/shard.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from typing import Any, Dict
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
index 72e41dc82e..9611e85117 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/options.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import argparse
 from pathlib import Path
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
index 46ab8ff74d..1f2a91b236 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/pdb.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import multiprocessing
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
index 4eceba59cb..a905af1bdf 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/quantization_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
index 257cdb6baf..8ca8ef967b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/registry.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from argparse import Namespace
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
index e22fea534d..a11833a86e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 import importlib
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
index eb9ee211d5..65b25d0100 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bertscore.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
index 5670814393..d72701e19e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/bleu.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import ctypes
 import math
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
index f583573b13..6e3ce8ffa9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/chrf.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
index 3d29e9fe04..cbd02e64d8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/meteor.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 from dataclasses import dataclass
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
index ffd1fbaf39..11dd14f8ef 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/tokenizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import unicodedata
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
index de315d7425..e0c4bf3e64 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/scoring/wer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
index 53f426fcb5..d74bfe9584 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/search.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
index 5ed023040a..21976c33ea 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_generator.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import math
 import sys
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
index 569da0ef3b..d5e72d85ed 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/sequence_scorer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import sys
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
index b78306fb06..33e9821d80 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/speech_generator.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import numpy as np
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
index 41e923b127..c762b4ea1a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/__init__.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
index 77634f1bae..60ddcbfab9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_finetuning.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
index e6de0666ad..8e73715a50 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/audio_pretraining.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
index 8a3731f177..fd07009e25 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/cross_lingual_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
index 7d75160f08..d3aebc2c9d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/denoising.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
index 160b442097..a6e1092700 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/fairseq_task.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
index 6df8ab9c77..b2eb49a07d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/frm_text_to_speech.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
index 1a3605f14d..e2a1ce486b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/hubert_pretraining.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
index f529f7f317..a6a4459022 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/language_modeling.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
index 1e30303a26..4bad75dfc3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/legacy_masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import itertools
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
index d6c5dbe7a0..eb9717acb5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
index de177fe53c..5a6c00101d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_denoising.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import logging
 import os
 from dataclasses import dataclass, field
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
index 61b375cb7c..1c381b2de2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_language_modeling.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
index 1f63687366..6ebbb1de28 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
index 3461fa5630..d0b272b0f2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/multilingual_translation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
index a335021335..9ddfa84343 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/nlu_finetuning.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
index 8cb92e99a4..4f13fa3b8c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/online_backtranslation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import contextlib
 import json
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
index da20e781d0..4077283e92 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/semisupervised_translation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
index 9fc8e8c1a9..ae8122fb6c 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
index 1fa4ce114c..1769b47be7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_prediction_adapters.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
index 83bdfc2246..524e85433f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/sentence_ranking.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
index 10ce4be313..76522c92e2 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/simultaneous_translation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from fairseq.tasks import register_task
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
index cef59ac4ec..a9b3d6c2b5 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/span_masked_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
index aebcda366c..037c1cfd81 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_dlm_task.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
index ea4cd5b9c7..329b252a85 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_speech.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import json
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
index 24287ce001..c6b8fc6466 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_to_text.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 from argparse import Namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
index b9d3019d50..0fafde5b09 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/speech_ulm_task.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # Copyright (c) 2017-present, Facebook, Inc.
 # All rights reserved.
 #
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
index 5bcd78ea9e..f1d63410f4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/text_to_speech.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import logging
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
index 5391e6fe12..322dcb1fde 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 import itertools
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
index 679f875813..65fe48a5a8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_bart.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import torch
 from fairseq import utils
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
index 202af52cc3..9ab68a5f60 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_from_pretrained_xlm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass
 from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
index 516bc21c59..ad687da7f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_lev.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 from dataclasses import dataclass, field
 import torch
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
index 86ed5a7694..48bd008ce7 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tasks/translation_multi_simple_epoch.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import datetime
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
index dffd4d1769..ad83637d59 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/token_generation_constraints.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """Implements tracking of constraints for a beam item.
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
index d57cbb64c9..424873d4c0 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/tokenizer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import re
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
index da6a14f293..089c9f94c1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 """
 Train a network across multiple GPUs.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
index 2d2bba6706..c848f203fc 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 
 import argparse
 import collections
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
index 9c2b93659f..c5c92e5085 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/eval_lm.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 """
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
index aa7077e984..4e2db47ede 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/generate.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 """
 Translate pre-processed data with a trained model.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
index 557533650c..2df2c7141d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_train.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
index a4ec0a836b..73d11022f1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/hydra_validate.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
index 0d96d8b65e..70b13a21da 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/interactive.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 """
 Translate raw text with a trained model. Batches data on-the-fly.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
index 06d54a094f..935d04aa26 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/preprocess.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 """
 Data pre-processing: build vocabularies and binarize training data.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
index f74f7c30af..9a7f0ab1e4 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/score.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 """
 BLEU scoring of generated translations against reference translations.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
index 0ef29864a7..ab008e074a 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 """
 Train a new model on one or across multiple GPUs.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
index 591f356195..457469450e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/validate.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 
 import logging
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
index 3a146356ed..f72b5f33f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/hubconf.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """isort:skip_file"""
 
 import functools
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
index 69a5e8dda3..0eba0f7445 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/release_utils.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 import argparse
 from typing import Tuple
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py
index e69de29bb2..2556cc4229 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/__init__.py
@@ -0,0 +1,33 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
index 26a8c42fd4..fa3228dc16 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/average_checkpoints.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
index 5f237ea210..3d7a56afc3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/build_sym_alignment.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 """
 Use this script in order to build symmetric alignments for your translation
 dataset.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
index e5a9d9dd46..38ed0b3bb9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/check_installation.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 from pathlib import Path
 import os
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
index bc24db624f..5e182b72af 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/compare_namespaces.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python
 """Helper script to compare two argparse.Namespace objects."""
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
index 39907e163d..19c5f1eacb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/extract.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 #
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
index 03ac4b9920..c92e76863b 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/constraints/validate.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 #
 
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
index e43b58996d..6e585be172 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/count_docs.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 """
 Count the number of documents and average number of lines and tokens per
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
index 876d517629..78fdea4640 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/read_binarized.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
index 33226c34dd..17fbbc6dfb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/rm_pt.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 import argparse
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
index 1db23b58b9..8b3ddf28da 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/shard_docs.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 """
 Split a large file into shards while respecting document boundaries. Documents
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
index 709b7eaa60..acb298422d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/split_train_valid_docs.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 """
 Split a large file into a train and valid set while respecting document
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
index 7d7b68b240..43db29a6eb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_decode.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python
 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
index f91e0bb728..cb57decda8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_encode.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python
 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
index 9db668fd41..d9ad042e14 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/scripts/spm_train.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python
 # Copyright (c) Facebook, Inc. and its affiliates.
 # All rights reserved.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
index 9c895f74c5..f50d3db308 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3
 
 import os
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
index 70506ef4dc..d23f42e143 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/train.py
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 #!/usr/bin/env python3 -u
 """
 Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead.
-- 
Gitee


From e8c72d76766cb8b7d5481d49a1558dc2b735f979 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Mon, 12 Jun 2023 11:38:49 +0800
Subject: [PATCH 05/12] run file

---
 .../fairseq_cli/train.py                      |  2 +-
 .../test/train_full_1p.sh                     | 47 ++++++++++++++++++
 .../test/train_full_8p.sh                     | 49 +++++++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
index ab008e074a..b840f12091 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq_cli/train.py
@@ -68,7 +68,7 @@ from fairseq.file_io import PathManager
 from fairseq.logging import meters, metrics, progress_bar
 from fairseq.model_parallel.megatron_trainer import MegatronTrainer
 from fairseq.trainer import Trainer
-import transfer_to_npu
+from torch_npu.contrib import transfer_to_npu
 import torch_npu
 
 def main(cfg: FairseqConfig) -> None:
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
new file mode 100644
index 0000000000..cbc9ed0546
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f $CANN_INSTALL_PATH_CONF ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+export PYTHONPATH=$(pwd)/../fairseq:$PYTHONPATH
+
+LOG_FILE=train_1p_npu.log
+
+fairseq-train \
+    data-bin/wmt18_en_de \
+    --distributed-world-size 1 \
+    --arch transformer_wmt_en_de --share-decoder-input-output-embed \
+    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
+    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
+    --dropout 0.3 --weight-decay 0.0001 \
+    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+    --max-tokens 4000 \
+    --eval-bleu \
+    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
+    --eval-bleu-detok moses \
+    --eval-bleu-remove-bpe \
+    --eval-bleu-print-samples \
+    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
+    --max-expoch 30 \
+    --save-dir checkpoints/zhuance > ${LOG_FILE} 2>&1 &
+
+if [ ! -f ${LOG_FILE}]; then
+    echo "No log file"
+    exit -1
+fi
+
+echo "---------------- Final Result ----------------"
+BEST_BLEU=`grep -P "INFO \| valid.+valid on 'valid' subset" ${LOG_FILE} | tail -n 1 | sed -E 's/.*best_bleu ([0-9]+\.[0-9]+)/\1/'`
+WPS=`grep -P "INFO \| train_inner.+wps" ${LOG_FILE} | tail -n 1 | sed -E 's/.+wps=([0-9]+).+/\1/'`
+echo "Best BLEU: ${BEST_BLEU}"
+echo "WPS: ${WPS}"
\ No newline at end of file
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
new file mode 100644
index 0000000000..2de48343f0
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f $CANN_INSTALL_PATH_CONF ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+export PYTHONPATH=$(pwd)/../fairseq:$PYTHONPATH
+
+export _DEFAULT_FIRST_BUCKET_BYTES=50
+
+LOG_FILE=train_8p_npu.log
+
+fairseq-train \
+    data-bin/wmt18_en_de \
+    --distributed-world-size 8 \
+    --arch transformer_wmt_en_de --share-decoder-input-output-embed \
+    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
+    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
+    --dropout 0.3 --weight-decay 0.0001 \
+    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+    --max-tokens 4000 \
+    --eval-bleu \
+    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
+    --eval-bleu-detok moses \
+    --eval-bleu-remove-bpe \
+    --eval-bleu-print-samples \
+    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
+    --max-expoch 30 \
+    --save-dir checkpoints/zhuance --bucket-cap-mb=300 > ${LOG_FILE} 2>&1 &
+
+if [ ! -f ${LOG_FILE} ]; then
+    echo "No log file"
+    exit -1
+fi
+
+echo "---------------- Final Result ----------------"
+BEST_BLEU=`grep -P "INFO \| valid.+valid on 'valid' subset" ${LOG_FILE} | tail -n 1 | sed -E 's/.*best_bleu ([0-9]+\.[0-9]+)/\1/'`
+WPS=`grep -P "INFO \| train_inner.+wps" ${LOG_FILE} | tail -n 1 | sed -E 's/.+wps=([0-9]+).+/\1/'`
+echo "Best BLEU: ${BEST_BLEU}"
+echo "WPS: ${WPS}"
-- 
Gitee


From 80c0edbf39944c54583965c658e4a5d3477f1ff0 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Mon, 12 Jun 2023 15:05:18 +0800
Subject: [PATCH 06/12] readme

---
 .../README.md                                 | 144 ++++++++++++++++++
 .../setup.py                                  |   4 +-
 .../test/train_full_1p.sh                     |   4 +-
 .../test/train_full_8p.sh                     |   4 +-
 4 files changed, 150 insertions(+), 6 deletions(-)
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
new file mode 100644
index 0000000000..9a91f1509c
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
@@ -0,0 +1,144 @@
+# Fairseq Transformer for PyTorch
+
+-   [概述](#概述)
+-   [准备训练环境](#准备训练环境)
+-   [开始训练](#开始训练)
+-   [训练结果展示](#训练结果展示)
+-   [版本说明](#版本说明)
+
+# 概述
+
+## 简述
+Fairseq是由Facebook开源的NLP工具库，它是对PyTorch的上层封装，
+让用户可以设计自己的模型来完成不同的NLP任务，如：机器翻译、文本摘要、语言模型等，
+并提供了主流模型架构（例如BERT、Transformer）的实现。
+
+本仓库为Transformer模型的文本翻译任务。
+
+- 参考实现：
+
+  ```
+  url=https://github.com/facebookresearch/fairseq.git
+  commit_id=3f6ba43f07a6e9e2acf957fc24e57251a7a3f55c
+  ```
+
+- 适配昇腾 AI 处理器的实现：
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/dev/perf
+  ```
+
+# 准备训练环境
+
+## 准备环境
+
+- 当前模型支持的 PyTorch 版本和已知三方库依赖如下表所示。
+
+  **表 1**  版本支持表
+
+  | Torch_Version |                                                                三方库依赖版本                                                               |
+  |:------------------------------------------------------------------------------------------------------------------------------------:| :----------------------------------------------------------: |
+  | PyTorch 1.11  |cffi, cython, hydra-core>=1.0.7,<1.1, omegaconf<2.1, numpy>=1.21.3, regex, sacrebleu>=1.4.12, tqdm, bitarray, scikit-learn, packaging|
+  
+- 环境准备指导。
+
+  请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。
+  
+- 安装依赖。
+ 
+在模型源码包根目录下执行命令，安装Fairseq工具库。
+  ```
+  pip install --editable ./
+  ```
+
+## 准备数据集
+
+1. 获取数据集。
+
+   用户自行获取原始数据集WMT18。将数据集上传到服务器并按照需求进行数据处理。
+   数据处理部分可以参考原仓库examples/translation下的数据处理脚本
+
+   以处理好的WMT18数据集为例，数据集目录结构参考如下所示。
+
+   ```
+   ├── data-bin
+         ├──wmt18_en_de
+              ├──train.en-de.de.idx
+              ├──train.en-de.de.bin
+              ├──... 
+   ```
+
+   > **说明：** 
+   > 该数据集的训练过程脚本只作为一种参考示例。
+
+# 开始训练
+
+## 训练模型
+
+1. 进入解压后的源码包根目录。
+
+   ```
+   cd /${模型文件夹名称} 
+   ```
+
+2. 运行训练脚本。
+
+   该模型支持单机单卡训练和单机8卡训练。
+
+   - 单机单卡训练
+
+     启动单卡训练。
+
+     ```
+     bash ./test/train_full_1p.sh <path-to-dataset> <path-to-checkpoint-save-dir>
+     ```
+   - 单机8卡训练
+
+     启动8卡训练。
+
+     ```
+     bash ./test/train_full_8p.sh <path-to-dataset> <path-to-checkpoint-save-dir>
+     ```
+
+   path-to-dataset表示WMT18的数据集路径，需填写到数据集的一级目录。
+
+   模型训练脚本参数说明如下。
+
+   ```
+   公共参数：
+   --distributed-world-size            //使用的计算单元数量
+   --arch                              //模型结构  
+   --optimizer                         //优化器选择 
+   --clip-norm                         //梯度裁剪阈值
+   --lr                                //学习率设置
+   --lr-scheduler                      //学习率调整策略
+   --dropout                           //Dropout概率设置
+   --weight-decay                      //权重衰减系数设置
+   --criterion                         //Loss计算标准
+   --max-tokens                        //一个batch中的最大token数量
+   --eval-bleu                         //使用bleu指标进行评估
+   --best-checkpoint-metric            //定义最佳模型的衡量指标
+   --max-epoch                         //最大训练epoch数量
+   --save-dir                          //模型文件保存路径
+   ```
+   
+   训练完成后，权重文件保存在`path-to-checkpoint-save-dir`指定的路径下，并输出模型训练精度和性能信息。
+
+# 训练结果展示
+
+**表 2**  训练结果展示表
+
+| NAME    | BLEU  | WPS  | Epochs | Data_Type | Torch_version |
+| :-----: |:-----:|:----:|:------:|:---------:|:-------------:|
+| 8p-竞品V | 41.12 | 334K |   20   |   FP32    |     1.11      |
+| 8p-NPU  | 41.21 | 223K |   20   |    FP32     |     1.11      |
+
+# 版本说明
+
+## 变更
+
+2023.06.12：首次发布。
+
+## FAQ
+无
\ No newline at end of file
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
index f50d3db308..0e439952b6 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/setup.py
@@ -213,10 +213,10 @@ def do_setup(package_data):
             "numpy>=1.21.3",
             "regex",
             "sacrebleu>=1.4.12",
-            "torch>=1.13",
+            # "torch>=1.13",
             "tqdm",
             "bitarray",
-            "torchaudio>=0.8.0",
+            # "torchaudio>=0.8.0",
             "scikit-learn",
             "packaging",
         ],
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
index cbc9ed0546..23292c5df9 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
@@ -18,7 +18,7 @@ export PYTHONPATH=$(pwd)/../fairseq:$PYTHONPATH
 LOG_FILE=train_1p_npu.log
 
 fairseq-train \
-    data-bin/wmt18_en_de \
+    $1 \
     --distributed-world-size 1 \
     --arch transformer_wmt_en_de --share-decoder-input-output-embed \
     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@@ -33,7 +33,7 @@ fairseq-train \
     --eval-bleu-print-samples \
     --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
     --max-expoch 30 \
-    --save-dir checkpoints/zhuance > ${LOG_FILE} 2>&1 &
+    --save-dir $2 > ${LOG_FILE} 2>&1 &
 
 if [ ! -f ${LOG_FILE}]; then
     echo "No log file"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
index 2de48343f0..1618b15d7d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
@@ -20,7 +20,7 @@ export _DEFAULT_FIRST_BUCKET_BYTES=50
 LOG_FILE=train_8p_npu.log
 
 fairseq-train \
-    data-bin/wmt18_en_de \
+    $1 \
     --distributed-world-size 8 \
     --arch transformer_wmt_en_de --share-decoder-input-output-embed \
     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
@@ -35,7 +35,7 @@ fairseq-train \
     --eval-bleu-print-samples \
     --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
     --max-expoch 30 \
-    --save-dir checkpoints/zhuance --bucket-cap-mb=300 > ${LOG_FILE} 2>&1 &
+    --save-dir $2 --bucket-cap-mb=300 > ${LOG_FILE} 2>&1 &
 
 if [ ! -f ${LOG_FILE} ]; then
     echo "No log file"
-- 
Gitee


From 618cfb87351f1ba520262eb7dabf62e1767df271 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Mon, 12 Jun 2023 17:27:25 +0800
Subject: [PATCH 07/12] cpp license

---
 .../clib/libbase/balanced_assignment.cpp      | 34 +++++++++++++++++--
 .../fairseq/clib/libbleu/libbleu.cpp          | 34 +++++++++++++++++--
 .../fairseq/clib/libbleu/module.cpp           | 34 +++++++++++++++++--
 .../fairseq/clib/libnat/edit_dist.cpp         | 34 +++++++++++++++++--
 4 files changed, 124 insertions(+), 12 deletions(-)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
index 1a5a1061f3..7cd4d6ec6f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbase/balanced_assignment.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 /*
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
index 939d9e1174..e9a3b30667 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/libbleu.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include <array>
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
index 35288b3177..c186a84966 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libbleu/module.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include <Python.h>
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
index 9ffb60569d..7f3aaceb67 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/clib/libnat/edit_dist.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include <pybind11/detail/common.h>
-- 
Gitee


From 04f34ed9f57af9c03fb5e3c7ce95d8259fc2a6a1 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Mon, 12 Jun 2023 20:40:58 +0800
Subject: [PATCH 08/12] run fix

---
 .../operators/alignment_train_cpu.cpp         | 166 ++++++++
 .../operators/alignment_train_cuda.cpp        |  31 ++
 .../examples/operators/alignment_train_cuda.h |  16 +
 .../operators/alignment_train_kernel.cu       | 354 ++++++++++++++++++
 .../examples/operators/utils.h                |  19 +
 .../fairseq/optim/adam.py                     | 223 ++++++-----
 .../fairseq/optim/fairseq_optimizer.py        |   2 -
 .../fairseq/trainer.py                        |   9 -
 .../test/train_full_1p.sh                     |   8 +-
 .../test/train_full_8p.sh                     |   6 +-
 10 files changed, 702 insertions(+), 132 deletions(-)
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
new file mode 100644
index 0000000000..13c015308e
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
@@ -0,0 +1,166 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <torch/extension.h> // @manual=//caffe2:torch_extension
+#include <algorithm>
+
+namespace {
+
+template <typename T>
+void exclusiveCumprod(
+    const T* p_choose,
+    T* cumprod_1mp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len) {
+  // Step 1: cumprod_1mp = 1 - p_choose, element-wise over [b][tgt][src].
+  for (uint32_t b = 0; b < bsz; b++) {
+    for (uint32_t tgt = 0; tgt < tgt_len; tgt++) {
+      for (uint32_t src = 0; src < src_len; src++) {
+        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src;
+        cumprod_1mp[idx] = 1 - p_choose[idx];
+      }
+    }
+  }
+  if (src_len == 0) return; // no [b][tgt][0] slot exists; avoid touching it
+  // Step 2: in-place exclusive cumulative product along src for each
+  // (b, tgt) row. PyTorch's cumprod has no exclusive mode:
+  //   inclusive: cumprod(x) = [x1, x1x2, x1x2x3, ..., prod_{i=1}^n x_i]
+  //   exclusive: cumprod(x) = [1, x1, x1x2, ..., prod_{i=1}^{n-1} x_i]
+  // `prev` carries the original x_{i-1}, since slot i-1 has already been
+  // overwritten with its exclusive product by the time slot i is computed.
+  for (uint32_t b = 0; b < bsz; b++) {
+    for (uint32_t tgt = 0; tgt < tgt_len; tgt++) {
+      uint32_t idx_offset = b * tgt_len * src_len + tgt * src_len;
+      T prev = cumprod_1mp[idx_offset];
+      // first slot [b][tgt][0] of the exclusive product is always 1
+      cumprod_1mp[idx_offset] = (T)1.0;
+      T curr;
+      for (uint32_t src = 1; src < src_len; src++) {
+        uint32_t idx = idx_offset + src;
+        curr = cumprod_1mp[idx];
+        cumprod_1mp[idx] = cumprod_1mp[idx - 1] * prev;
+        prev = curr;
+      }
+    }
+  }
+}
+
+template <typename T>
+void clamp(
+    const T* cumprod_1mp,
+    T* cumprod_1mp_clamp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    T min_val,
+    T max_val) {
+  // Writes to cumprod_1mp_clamp a copy of cumprod_1mp in which every
+  // [b][tgt][src] element below min_val becomes min_val and every element
+  // above max_val becomes max_val; all other values are copied unchanged.
+  for (uint32_t b = 0; b < bsz; b++) {
+    for (uint32_t tgt = 0; tgt < tgt_len; tgt++) {
+      // Offset of element [b][tgt][0] in the flattened buffers.
+      const uint32_t row = b * tgt_len * src_len + tgt * src_len;
+      for (uint32_t src = 0; src < src_len; src++) {
+        const T value = cumprod_1mp[row + src];
+        cumprod_1mp_clamp[row + src] =
+            value < min_val ? min_val : (value > max_val ? max_val : value);
+      }
+    }
+  }
+}
+
+template <typename T>
+void alignmentTrainCPUImpl(
+    const T* p_choose,
+    T* alpha,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    float eps) {
+  // Fills alpha from p_choose; both are indexed as [b][tgt][src], i.e.
+  // bsz * tgt_len * src_len elements. cumprod_1mp and cumprod_1mp_clamp are
+  // scratch buffers of that size; the clamped copy (limited to [eps, 1])
+  // is only used as the divisor in the scan below.
+  if (bsz == 0 || tgt_len == 0 || src_len == 0) return; // nothing to fill
+  uint32_t elements = bsz * tgt_len * src_len;
+  T* cumprod_1mp = new T[elements];
+  T* cumprod_1mp_clamp = new T[elements];
+
+  exclusiveCumprod<T>(p_choose, cumprod_1mp, bsz, tgt_len, src_len);
+  clamp<T>(
+      cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0);
+
+  // ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi))
+
+  // Initialize alpha [:, 0, 0]
+  for (uint32_t b = 0; b < bsz; b++) {
+    alpha[b * tgt_len * src_len] = 1.0;
+  }
+
+  for (uint32_t tgt = 0; tgt < tgt_len; tgt++) {
+    for (uint32_t b = 0; b < bsz; b++) {
+      uint32_t alpha_idx, inout_idx;
+      T prev_scan = 0, curr_scan, out;
+      for (uint32_t src = 0; src < src_len; src++) {
+        // Apply scan/cumsum
+        if (tgt == 0) {
+          // alpha index is [b][tgt][src]
+          alpha_idx = b * tgt_len * src_len + src;
+        } else {
+          // alpha index is [b][tgt-1][src]
+          alpha_idx = b * tgt_len * src_len + (tgt - 1) * src_len + src;
+        }
+        // input index is [b][tgt][src]
+        inout_idx = b * tgt_len * src_len + tgt * src_len + src;
+        curr_scan = prev_scan + alpha[alpha_idx] / cumprod_1mp_clamp[inout_idx];
+
+        out = curr_scan * p_choose[inout_idx] * cumprod_1mp[inout_idx];
+        alpha[inout_idx] = std::min<T>(std::max<T>(out, 0), 1.0);
+        prev_scan = curr_scan;
+      }
+    }
+  }
+  // Allocated with new[] above, so release with delete[] (not free()).
+  delete[] cumprod_1mp;
+  delete[] cumprod_1mp_clamp;
+}
+
+void alignmentTrainCPU(
+    const torch::Tensor& p_choose,
+    torch::Tensor& alpha,
+    float eps) {
+  uint32_t bsz = p_choose.size(0); // p_choose is read as [bsz][tgt_len][src_len]
+  uint32_t tgt_len = p_choose.size(1);
+  uint32_t src_len = p_choose.size(2);
+  // Dispatch on p_choose's dtype; alpha is accessed as the same element type.
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      torch::ScalarType::Half,
+      torch::ScalarType::BFloat16,
+      p_choose.scalar_type(),
+      "alignmentCPUImpl",
+      [&]() {
+        alignmentTrainCPUImpl<scalar_t>(
+            p_choose.data_ptr<scalar_t>(),
+            alpha.data_ptr<scalar_t>(),
+            bsz,
+            tgt_len,
+            src_len,
+            eps);
+      });
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Python binding for alignmentTrainCPU
+  m.def(
+      "alignment_train_cpu",
+      &alignmentTrainCPU,
+      "expected_alignment_from_p_choose (CPU)");
+}
+
+} // namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
new file mode 100644
index 0000000000..430e048139
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "alignment_train_cuda.h"
+#include "utils.h"
+
+namespace {
+
+void alignmentTrainCUDA( // bound to Python as "alignment_train_cuda" below
+    const torch::Tensor& p_choose,
+    torch::Tensor& alpha,
+    float eps) {
+  CHECK_INPUT(p_choose); // must be a contiguous CUDA tensor (macros in utils.h)
+  CHECK_INPUT(alpha); // same requirement for the buffer that receives the result
+
+  alignmentTrainCUDAWrapper(p_choose, alpha, eps); // declared in alignment_train_cuda.h
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def(
+      "alignment_train_cuda",
+      &alignmentTrainCUDA,
+      "expected_alignment_from_p_choose (CUDA)");
+}
+
+} // namespace
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
new file mode 100644
index 0000000000..8289d1a690
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
@@ -0,0 +1,16 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <torch/extension.h> // @manual=//caffe2:torch_extension
+
+void alignmentTrainCUDAWrapper(
+    const torch::Tensor& p_choose,
+    torch::Tensor& alpha,
+    float eps);
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
new file mode 100644
index 0000000000..efae7cc76f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
@@ -0,0 +1,354 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h> // @manual=//caffe2/aten:ATen-cu
+#include <cuda_runtime.h>
+#include <algorithm> // std::min/max
+#include <cub/cub.cuh>
+
+#include "alignment_train_cuda.h"
+#include "utils.h"
+
+namespace {
+
+// The thread block length in threads along the X dimension
+constexpr int BLOCK_DIM_X = 128;
+// The thread block length in threads along the Y dimension
+constexpr int BLOCK_DIM_Y = 8;
+// The thread block length in threads for scan operation
+constexpr int SCAN_BLOCK = 512;
+
+#define gpuErrchk(ans) \
+  { gpuAssert((ans), __FILE__, __LINE__); }
+
+inline void // helper behind the gpuErrchk(...) macro defined just above
+gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
+  if (code != cudaSuccess) { // nothing happens on success
+    fprintf(
+        stderr,
+        "\nGPUassert: %s %s %d\n",
+        cudaGetErrorString(code),
+        file,
+        line);
+    if (abort)
+      exit(code); // ends the whole process with the CUDA error code as exit status
+  }
+}
+
+template <typename T>
+struct Prod {
+  /// Multiplication functor handed to BlockScan::ExclusiveScan; returns <tt>a * b</tt>
+  __host__ __device__ __forceinline__ T
+  operator()(const T& a, const T& b) const {
+    return a * b;
+  }
+};
+
+template <typename T>
+struct BlockPrefixProdCallbackOp {
+  // Running product of the aggregates of all tiles scanned so far
+  T running_total;
+
+  // Constructor: seeds the running product (innermostScanKernel passes 1)
+  __device__ BlockPrefixProdCallbackOp(T running_total)
+      : running_total(running_total) {}
+
+  // Callback operator to be entered by the first warp of threads in the block.
+  // Thread-0 is responsible for returning a value for seeding the block-wide
+  // scan.
+  __device__ T operator()(const T block_aggregate) {
+    T old_prefix = running_total; // prefix that applies to the current tile
+    running_total *= block_aggregate; // fold this tile in for the next call
+    return old_prefix;
+  }
+};
+
+template <typename T>
+struct BlockPrefixSumCallbackOp {
+  // Running sum of the aggregates of all tiles scanned so far
+  T running_total;
+
+  // Constructor: seeds the running sum (alignmentTrainCUDAKernel passes 0)
+  __device__ BlockPrefixSumCallbackOp(T running_total)
+      : running_total(running_total) {}
+
+  // Callback operator to be entered by the first warp of threads in the block.
+  // Thread-0 is responsible for returning a value for seeding the block-wide
+  // scan.
+  __device__ T operator()(const T block_aggregate) {
+    T old_prefix = running_total; // prefix that applies to the current tile
+    running_total += block_aggregate; // fold this tile in for the next call
+    return old_prefix;
+  }
+};
+
+template <typename T>
+__global__ void oneMinusPKernel( // element-wise: cumprod_1mp = 1 - p_choose
+    const T* __restrict__ p_choose,
+    T* __restrict__ cumprod_1mp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len) {
+  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { // blocks step over batches
+    for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) {
+      for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) {
+        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; // flat [b][tgt][src]
+        cumprod_1mp[idx] = 1 - p_choose[idx];
+      }
+    }
+  }
+}
+
+template <typename T, int TPB>
+__global__ void innermostScanKernel( // in-place exclusive product scan along src
+    T* __restrict__ cumprod_1mp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len) {
+  for (uint32_t b = blockIdx.y; b < bsz; b += gridDim.y) {
+    for (uint32_t tgt = blockIdx.x; tgt < tgt_len; tgt += gridDim.x) {
+      // Specialize BlockScan for a 1D block of TPB threads on type T
+      typedef cub::BlockScan<T, TPB> BlockScan;
+      // Allocate shared memory for BlockScan
+      __shared__ typename BlockScan::TempStorage temp_storage;
+      // Running product carried from one tile of the row to the next; starts at 1
+      BlockPrefixProdCallbackOp<T> prefix_op(1);
+
+      const uint32_t tid = threadIdx.x;
+      for (uint32_t block_src = 0; block_src < src_len;
+           block_src += blockDim.x) {
+        uint32_t src = block_src + tid;
+        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src;
+        T thread_data = (src < src_len) ? cumprod_1mp[idx] : (T)0; // padded lanes are never stored
+
+        // Collectively compute the block-wide exclusive prefix product of this tile
+        BlockScan(temp_storage)
+            .ExclusiveScan(thread_data, thread_data, Prod<T>(), prefix_op);
+        __syncthreads(); // NOTE(review): presumably so temp_storage can be reused by the next tile
+
+        // write the scanned value to output
+        if (src < src_len) {
+          cumprod_1mp[idx] = thread_data;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void clampKernel( // cumprod_1mp_clamp = cumprod_1mp limited to [min_val, max_val]
+    const T* __restrict__ cumprod_1mp,
+    T* __restrict__ cumprod_1mp_clamp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    T min_val,
+    T max_val) {
+  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { // blocks step over batches
+    for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) {
+      for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) {
+        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; // flat [b][tgt][src]
+        if (cumprod_1mp[idx] < min_val) {
+          cumprod_1mp_clamp[idx] = min_val;
+        } else if (cumprod_1mp[idx] > max_val) {
+          cumprod_1mp_clamp[idx] = max_val;
+        } else {
+          cumprod_1mp_clamp[idx] = cumprod_1mp[idx]; // already inside the range
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void initAlphaCUDAKernel( // writes only element [b][0][0] of each batch
+    T* alpha,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len) {
+  // alpha[:, 0, 0] = 1.0
+  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) {
+    alpha[b * tgt_len * src_len] = (T)1.0; // all other elements of alpha are left as passed in
+  }
+}
+
+template <typename T, int TPB>
+__global__ void alignmentTrainCUDAKernel( // fills row `tgt` of alpha for every batch
+    const T* __restrict__ p_choose,
+    const T* __restrict__ cumprod_1mp,
+    const T* __restrict__ cumprod_1mp_clamp,
+    T* __restrict__ alpha,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    uint32_t tgt) {
+  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) {
+    // Specialize BlockScan for a 1D block of TPB threads on type T
+    typedef cub::BlockScan<T, TPB> BlockScan;
+
+    // Allocate shared memory for BlockScan
+    __shared__ typename BlockScan::TempStorage temp_storage;
+    // Running sum carried from one tile of the row to the next; starts at 0
+    BlockPrefixSumCallbackOp<T> prefix_op(0);
+
+    uint32_t b_offset = b * tgt_len * src_len;
+    const uint32_t tid = threadIdx.x;
+    for (uint32_t block_src = 0; block_src < src_len; block_src += blockDim.x) {
+      uint32_t src = block_src + tid;
+      // Each thread handles one src position of the current blockDim.x-wide tile
+      uint32_t inout_idx, alpha_idx;
+      if (tgt == 0) {
+        // both alpha and other input index is [b][0][src]
+        alpha_idx = b_offset + src;
+      } else {
+        // alpha index is [b][tgt-1][src]
+        alpha_idx = b_offset + (tgt - 1) * src_len + src;
+      }
+      inout_idx = b_offset + tgt * src_len + src;
+      T thread_data = (T)0; // out-of-range lanes contribute 0 to the sum
+      if (src < src_len) {
+        thread_data = alpha[alpha_idx] / cumprod_1mp_clamp[inout_idx];
+      }
+
+      // Collectively compute the block-wide inclusive prefix sum
+      BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op);
+      __syncthreads(); // NOTE(review): presumably so temp_storage can be reused by the next tile
+
+      if (src < src_len) {
+        T out = thread_data * p_choose[inout_idx] * cumprod_1mp[inout_idx];
+        // Clamps all elements into the range [ 0, 1.0 ]
+        alpha[inout_idx] = std::min<T>(std::max<T>(out, 0), (T)1.0);
+      }
+    }
+  }
+}
+
+template <typename T>
+void exclusiveCumprod( // launches the two kernels that fill cumprod_1mp on `stream`
+    const T* p_choose,
+    T* cumprod_1mp,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    uint32_t max_grid_x,
+    uint32_t max_grid_y,
+    cudaStream_t& stream) {
+  // cumprod_1mp = 1 - p_choose
+  dim3 grid(std::min<uint32_t>(max_grid_x, bsz), 1, 1); // block counts are integers, not T
+  dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+  oneMinusPKernel<T><<<grid, block, 0, stream>>>(
+      p_choose, cumprod_1mp, bsz, tgt_len, src_len);
+  gpuErrchk(cudaGetLastError());
+
+  // scan on the innermost dimension of cumprod_1mp
+  // cumprod_1mp = exclusive cumprod(cumprod_1mp) along src, computed in place
+  dim3 grid_scan(
+      std::min<uint32_t>(max_grid_x, tgt_len), std::min<uint32_t>(max_grid_y, bsz), 1);
+  innermostScanKernel<T, SCAN_BLOCK><<<grid_scan, SCAN_BLOCK, 0, stream>>>(
+      cumprod_1mp, bsz, tgt_len, src_len);
+  gpuErrchk(cudaGetLastError());
+}
+
+template <typename T>
+void alignmentTrainCUDAImpl( // host driver: allocates scratch, launches all kernels, frees scratch
+    const T* p_choose,
+    T* alpha,
+    uint32_t bsz,
+    uint32_t tgt_len,
+    uint32_t src_len,
+    float eps) {
+  // p_choose: bsz , tgt_len, src_len
+  // cumprod_1mp: bsz , tgt_len, src_len
+  // cumprod_1mp_clamp : bsz, tgt_len, src_len
+  // alpha: bsz, tgt_len, src_len
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  uint32_t max_grid_x = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
+  uint32_t max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+
+  // Implementing exclusive cumprod.
+  // cumprod_1mp = cumprod(1 - p_choose)
+  // There is cumprod in pytorch, however there is no exclusive mode.
+  // cumprod(x) = [x1, x1x2, x1x2x3, ..., prod_{i=1}^n x_i]
+  // exclusive means
+  // cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i]
+  uint32_t elements = bsz * tgt_len * src_len; // NOTE(review): 32-bit, like the kernel indices
+  T* cumprod_1mp;
+  gpuErrchk(cudaMalloc(&cumprod_1mp, elements * sizeof(T)));
+  exclusiveCumprod<T>(
+      p_choose,
+      cumprod_1mp,
+      bsz,
+      tgt_len,
+      src_len,
+      max_grid_x,
+      max_grid_y,
+      stream);
+
+  // clamp cumprod_1mp to the range [eps, 1.0]
+  T* cumprod_1mp_clamp;
+  gpuErrchk(cudaMalloc(&cumprod_1mp_clamp, elements * sizeof(T)));
+  dim3 grid_clamp(std::min<uint32_t>(max_grid_x, bsz), 1, 1); // block count: integer min, not T
+  dim3 block_clamp(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+  clampKernel<T><<<grid_clamp, block_clamp, 0, stream>>>(
+      cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0);
+  gpuErrchk(cudaGetLastError());
+
+  // ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi))
+  dim3 grid_init(std::min<int>(max_grid_x, bsz), 1, 1);
+  initAlphaCUDAKernel<T>
+      <<<grid_init, 1, 0, stream>>>(alpha, bsz, tgt_len, src_len);
+  gpuErrchk(cudaGetLastError());
+
+  const int grid = std::min(bsz, max_grid_x);
+
+  for (uint32_t i = 0; i < tgt_len; i++) { // row i reads row i-1, so rows run one launch at a time
+    alignmentTrainCUDAKernel<T, SCAN_BLOCK><<<grid, SCAN_BLOCK, 0, stream>>>(
+        p_choose,
+        cumprod_1mp,
+        cumprod_1mp_clamp,
+        alpha,
+        bsz,
+        tgt_len,
+        src_len,
+        i);
+    gpuErrchk(cudaGetLastError());
+  }
+
+  gpuErrchk(cudaFree(cumprod_1mp));
+  gpuErrchk(cudaFree(cumprod_1mp_clamp));
+}
+
+} // namespace
+
+void alignmentTrainCUDAWrapper( // dtype-dispatching entry point called from alignment_train_cuda.cpp
+    const torch::Tensor& p_choose,
+    torch::Tensor& alpha,
+    float eps) {
+  // p_choose dimension: bsz, tgt_len, src_len
+  uint32_t bsz = p_choose.size(0);
+  uint32_t tgt_len = p_choose.size(1);
+  uint32_t src_len = p_choose.size(2);
+
+  gpuErrchk(cudaSetDevice(p_choose.get_device())); // fail loudly instead of running on the wrong device
+
+  AT_DISPATCH_FLOATING_TYPES_AND2( // selects scalar_t from p_choose's dtype
+      torch::ScalarType::Half,
+      torch::ScalarType::BFloat16,
+      p_choose.scalar_type(),
+      "alignmentTrainCUDAImpl",
+      [&]() {
+        alignmentTrainCUDAImpl<scalar_t>(
+            p_choose.data_ptr<scalar_t>(),
+            alpha.data_ptr<scalar_t>(), // alpha is accessed with p_choose's scalar type
+            bsz,
+            tgt_len,
+            src_len,
+            eps);
+      });
+}
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h
new file mode 100644
index 0000000000..0ef5b4383f
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h
@@ -0,0 +1,19 @@
+/**
+ * Copyright 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <torch/extension.h> // @manual=//caffe2:torch_extension
+
+#define CHECK_CUDA(x) \
+  TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) /* one statement, so it is safe under an unbraced if */ \
+  do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); \
+  } while (0)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
index 14bcbd797d..eaee70bcc1 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/adam.py
@@ -128,14 +128,14 @@ class FairseqAdam(FairseqOptimizer):
     def average_params(self):
         """Reduce Params is only used during BMUF distributed training."""
 
-    state_dict = self.optimizer.state_dict()
-    total_gpus = float(dist.get_world_size())
+        state_dict = self.optimizer.state_dict()
+        total_gpus = float(dist.get_world_size())
 
-    for _, value in state_dict["state"].items():
-        value["exp_avg"] /= total_gpus
-        value["exp_avg_sq"] /= total_gpus
-        dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
-        dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
+        for _, value in state_dict["state"].items():
+            value["exp_avg"] /= total_gpus
+            value["exp_avg_sq"] /= total_gpus
+            dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
+            dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
 
 
 class Adam(torch.optim.Optimizer):
@@ -335,140 +335,139 @@ class Adam(torch.optim.Optimizer):
 
     def _check_initialized(self, groups):
         def _one_kind_initialized(pair):
+            return pair and (pair[0] is not None or pair[1] is not None)
+        return groups and any(map(_one_kind_initialized, groups))
 
-    return pair and (pair[0] is not None or pair[1] is not None)
-    return groups and any(map(_one_kind_initialized, groups))
 
+    def _maybe_init_combine_states(self):
+        if self._check_initialized(self._combined_state_indexed_by_group):
+            return
 
-def _maybe_init_combine_states(self):
-    if self._check_initialized(self._combined_state_indexed_by_group):
-        return
+        self._combined_state_indexed_by_group = list(map(
+            self._combine_group_param_states,
+            range(len(self.param_groups))))
 
-    self._combined_state_indexed_by_group = list(map(
-        self._combine_group_param_states,
-        range(len(self.param_groups))))
 
+    def _combine_params_and_grads(self, group_index):
+        group = self.param_groups[group_index]
+        params = group['params']
 
-def _combine_params_and_grads(self, group_index):
-    group = self.param_groups[group_index]
-    params = group['params']
-
-    group_params_list = [[], []]
-    group_grads_list = [[], []]
+        group_params_list = [[], []]
+        group_grads_list = [[], []]
 
-    for p in params:
-        if p.grad is None:
-            continue
-        param_size = p.storage().size()
-        grad_size = p.grad.storage().size()
+        for p in params:
+            if p.grad is None:
+                continue
+            param_size = p.storage().size()
+            grad_size = p.grad.storage().size()
 
-        if param_size != grad_size:
-            p.grad.data = p.grad.data.clone()
+            if param_size != grad_size:
+                p.grad.data = p.grad.data.clone()
 
-        kind = 0 if hasattr(p, 'expert') else 1
-        group_params_list[kind].append(p)
-        group_grads_list[kind].append(p.grad)
+            kind = 0 if hasattr(p, 'expert') else 1
+            group_params_list[kind].append(p)
+            group_grads_list[kind].append(p.grad)
 
-    combined_params = [None, None]
-    combined_grads = [None, None]
-    for kind in range(len(combined_params)):
-        combined_params[kind] = npu_combine_tensors(group_params_list[kind])
-        combined_grads[kind] = npu_combine_tensors(group_grads_list[kind])
+        combined_params = [None, None]
+        combined_grads = [None, None]
+        for kind in range(len(combined_params)):
+            combined_params[kind] = npu_combine_tensors(group_params_list[kind])
+            combined_grads[kind] = npu_combine_tensors(group_grads_list[kind])
 
-    return combined_params, combined_grads
+        return combined_params, combined_grads
 
 
-def _maybe_init_combine_params_and_grads(self):
-    if self._check_initialized(self._combined_params_indexed_by_group) and \
-            self._check_initialized(self._combined_grads_indexed_by_group):
-        return
+    def _maybe_init_combine_params_and_grads(self):
+        if self._check_initialized(self._combined_params_indexed_by_group) and \
+                self._check_initialized(self._combined_grads_indexed_by_group):
+            return
 
-    self._combined_params_indexed_by_group = []
-    self._combined_grads_indexed_by_group = []
+        self._combined_params_indexed_by_group = []
+        self._combined_grads_indexed_by_group = []
 
-    for group_index in range(len(self.param_groups)):
-        combined_params, combined_grads = self._combine_params_and_grads(group_index)
-        self._combined_params_indexed_by_group.append(combined_params)
-        self._combined_grads_indexed_by_group.append(combined_grads)
+        for group_index in range(len(self.param_groups)):
+            combined_params, combined_grads = self._combine_params_and_grads(group_index)
+            self._combined_params_indexed_by_group.append(combined_params)
+            self._combined_grads_indexed_by_group.append(combined_grads)
 
 
-@torch.no_grad()
-def zero_grad(self, set_to_none=False):
-    if set_to_none:
-        raise ValueError(
-            "set_to_none is not supported in fused optimizers")
+    @torch.no_grad()
+    def zero_grad(self, set_to_none=False):
+        if set_to_none:
+            raise ValueError(
+                "set_to_none is not supported in fused optimizers")
 
-    self._maybe_init_combine_params_and_grads()
+        self._maybe_init_combine_params_and_grads()
 
-    for grad_pair in self._combined_grads_indexed_by_group:
-        for grad in grad_pair:
-            if grad is None:
-                continue
-            grad.zero_()
-
-
-@torch.no_grad()
-def clip_grad_norm_(self, params, max_norm, aggregate_norm_fn=None) -> torch.Tensor:
-    def grad_exists(p):
-        return p is not None and getattr(p, "grad", None) is not None
-
-    if isinstance(params, torch.Tensor):
-        params = [params]
-    params = list(params)
-    grads = [
-        p.grad for p in params if grad_exists(p) and not hasattr(p, "expert")
-    ]
-    expert_grads = [
-        p.grad for p in params if grad_exists(p) and hasattr(p, "expert")
-    ]
-
-    if len(grads) == 0:
-        if len(params) > 0:
-            return params[0].new_tensor(0.0)
-        else:
-            return torch.tensor(0.0)
-
-    if len(grads) == 1:
-        total_norm = torch.norm(grads[0].detach(), p=2, dtype=torch.float32)
-    else:
-        if torch.cuda.is_available():
-            device = torch.cuda.current_device()
-        elif grads[0].device.type == "xla":
-            device = grads[0].device
-        else:
-            device = torch.device("cpu")
+        for grad_pair in self._combined_grads_indexed_by_group:
+            for grad in grad_pair:
+                if grad is None:
+                    continue
+                grad.zero_()
 
-        combined_grads_without_expert = self._combined_grads_indexed_by_group[0][1]
 
-        if combined_grads_without_expert is None:
+    @torch.no_grad()
+    def clip_grad_norm_(self, params, max_norm, aggregate_norm_fn=None) -> torch.Tensor:
+        def grad_exists(p):
+            return p is not None and getattr(p, "grad", None) is not None
+
+        if isinstance(params, torch.Tensor):
+            params = [params]
+        params = list(params)
+        grads = [
+            p.grad for p in params if grad_exists(p) and not hasattr(p, "expert")
+        ]
+        expert_grads = [
+            p.grad for p in params if grad_exists(p) and hasattr(p, "expert")
+        ]
+
+        if len(grads) == 0:
             if len(params) > 0:
                 return params[0].new_tensor(0.0)
             else:
                 return torch.tensor(0.0)
 
-        total_norm = torch.norm(combined_grads_without_expert, p=2, dtype=torch.float32).to(device)
+        if len(grads) == 1:
+            total_norm = torch.norm(grads[0].detach(), p=2, dtype=torch.float32)
+        else:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            elif grads[0].device.type == "xla":
+                device = grads[0].device
+            else:
+                device = torch.device("cpu")
+
+            combined_grads_without_expert = self._combined_grads_indexed_by_group[0][1]
 
-    if aggregate_norm_fn is not None:
-        total_norm = aggregate_norm_fn(total_norm)
+            if combined_grads_without_expert is None:
+                if len(params) > 0:
+                    return params[0].new_tensor(0.0)
+                else:
+                    return torch.tensor(0.0)
 
-    if max_norm > 0:
-        max_norm = float(max_norm)
-        clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1)
-        for combined_grad in self._combined_grads_indexed_by_group[0]:
-            combined_grad.mul_(clip_coef)
+            total_norm = torch.norm(combined_grads_without_expert, p=2, dtype=torch.float32).to(device)
+
+        if aggregate_norm_fn is not None:
+            total_norm = aggregate_norm_fn(total_norm)
 
-    return total_norm
+        if max_norm > 0:
+            clip_coef = (float(max_norm) / (total_norm + 1e-6)).clamp_(max=1)
+            for combined_grad in self._combined_grads_indexed_by_group[0]:
+                if combined_grad is not None:  # skip a kind with no combined buffer, as zero_grad does
+                    combined_grad.mul_(clip_coef)
 
+        return total_norm
 
-@torch.no_grad()
-def multiply_grads(self, c):
-    """Multiplies grads by a constant *c*."""
-    # combine tensors
-    self._maybe_init_combine_states()
-    self._maybe_init_combine_params_and_grads()
 
-    device = torch.cuda.current_device()
-    factor = c.to(device) if torch.is_tensor(c) else c
-    for combined_grad in self._combined_grads_indexed_by_group[0]:
-        if combined_grad is not None:
-            combined_grad.mul_(factor)
+    @torch.no_grad()
+    def multiply_grads(self, c):
+        """Multiplies grads by a constant *c*."""
+        # combine tensors
+        self._maybe_init_combine_states()
+        self._maybe_init_combine_params_and_grads()
+
+        device = torch.cuda.current_device()
+        factor = c.to(device) if torch.is_tensor(c) else c
+        for combined_grad in self._combined_grads_indexed_by_group[0]:
+            if combined_grad is not None:
+                combined_grad.mul_(factor)
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
index 1cc841406d..c33a612a35 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/optim/fairseq_optimizer.py
@@ -154,8 +154,6 @@ class FairseqOptimizer(object):
 
     def zero_grad(self):
         """Clears the gradients of all optimized parameters."""
-        for p in self.params:
-            p.grad = None
         self.optimizer.zero_grad()
 
     @property
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
index 089c9f94c1..9a65b4fe61 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/trainer.py
@@ -349,11 +349,6 @@ class Trainer(object):
                 self.cfg, params, allow_unsupported=allow_unsupported
             )
         elif self.cfg.common.fp16 or self.cfg.common.bf16 or self.cfg.common.amp:
-            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
-                logger.info(
-                    "NOTE: your device does NOT support faster training with --fp16 or --amp, "
-                    "please switch to FP32 which is likely to be faster"
-                )
             if (
                 self.cfg.common.memory_efficient_fp16
                 or self.cfg.common.memory_efficient_bf16
@@ -366,10 +361,6 @@ class Trainer(object):
             else:
                 self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
         else:
-            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
-                logger.info(
-                    "NOTE: your device may support faster training with --fp16 or --amp"
-                )
             self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)
 
         if self.is_fsdp:
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
index 23292c5df9..f63623b0cb 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_1p.sh
@@ -13,8 +13,6 @@ else
     source ${CANN_INSTALL_PATH}/nnae/set_env.sh
 fi
 
-export PYTHONPATH=$(pwd)/../fairseq:$PYTHONPATH
-
 LOG_FILE=train_1p_npu.log
 
 fairseq-train \
@@ -32,10 +30,10 @@ fairseq-train \
     --eval-bleu-remove-bpe \
     --eval-bleu-print-samples \
     --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
-    --max-expoch 30 \
-    --save-dir $2 > ${LOG_FILE} 2>&1 &
+    --max-epoch 20 \
+    --save-dir $2 > ${LOG_FILE} 2>&1
 
-if [ ! -f ${LOG_FILE}]; then
+if [ ! -f ${LOG_FILE} ]; then
     echo "No log file"
     exit -1
 fi
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
index 1618b15d7d..e90ca39889 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/test/train_full_8p.sh
@@ -13,8 +13,6 @@ else
     source ${CANN_INSTALL_PATH}/nnae/set_env.sh
 fi
 
-export PYTHONPATH=$(pwd)/../fairseq:$PYTHONPATH
-
 export _DEFAULT_FIRST_BUCKET_BYTES=50
 
 LOG_FILE=train_8p_npu.log
@@ -34,8 +32,8 @@ fairseq-train \
     --eval-bleu-remove-bpe \
     --eval-bleu-print-samples \
     --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
-    --max-expoch 30 \
-    --save-dir $2 --bucket-cap-mb=300 > ${LOG_FILE} 2>&1 &
+    --max-epoch 20 \
+    --save-dir $2 --bucket-cap-mb=300 > ${LOG_FILE} 2>&1
 
 if [ ! -f ${LOG_FILE} ]; then
     echo "No log file"
-- 
Gitee


From f9fa306b745a992093d9eb2ea804c676b405b1e6 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Tue, 13 Jun 2023 09:59:40 +0800
Subject: [PATCH 09/12] huawei license add

---
 .../LICENSE                                   | 29 +++++++++++++++
 .../operators/alignment_train_cpu.cpp         | 34 ++++++++++++++++--
 .../operators/alignment_train_cuda.cpp        | 34 ++++++++++++++++--
 .../examples/operators/alignment_train_cuda.h | 34 ++++++++++++++++--
 .../operators/alignment_train_kernel.cu       | 34 ++++++++++++++++--
 .../examples/operators/utils.h                | 34 ++++++++++++++++--
 .../fairseq/data/data_utils_fast.pyx          | 33 +++++++++++++++++
 .../fairseq/data/token_block_utils_fast.pyx   | 33 +++++++++++++++++
 .../dynamicconv_layer/dynamicconv_cuda.cpp    | 35 +++++++++++++++++--
 9 files changed, 282 insertions(+), 18 deletions(-)
 create mode 100644 PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/LICENSE

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/LICENSE b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/LICENSE
new file mode 100644
index 0000000000..ca94ebd754
--- /dev/null
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2023,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
index 13c015308e..ee24caf70d 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cpu.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include <torch/extension.h> // @manual=//caffe2:torch_extension
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
index 430e048139..be69217b75 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.cpp
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include "alignment_train_cuda.h"
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
index 8289d1a690..8d9dd52b54 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_cuda.h
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #pragma once
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
index efae7cc76f..4ea6268586 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/alignment_train_kernel.cu
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #include <ATen/ATen.h>
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h
index 0ef5b4383f..05d86230be 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/examples/operators/utils.h
@@ -1,9 +1,37 @@
 /**
- * Copyright 2017-present, Facebook, Inc.
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
  * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
  *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
  */
 
 #pragma once
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
index 16e5bae3e0..d33527a964 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/data_utils_fast.pyx
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # cython: language_level=3
 
 import numpy as np
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
index 4b23c4022c..8a4aea26f8 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/data/token_block_utils_fast.pyx
@@ -1,3 +1,36 @@
+#
+# BSD 3-Clause License
+#
+# Copyright (c) 2023 xxxx
+# All rights reserved.
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+#
 # cython: language_level=3
 
 import numpy as np
diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
index 744c363e55..cf8abca4c3 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp
@@ -1,8 +1,37 @@
 /**
- * Copyright (c) Facebook, Inc. and its affiliates.
  *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
+ * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
+ *
  */
 
 #include <torch/extension.h>
-- 
Gitee


From 65139f60d01860c6018d3f70ffe6d1ba02c6a7bf Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Tue, 13 Jun 2023 10:38:20 +0800
Subject: [PATCH 10/12] huawei license add

---
 .../dynamicconv_layer/dynamiconv_cpu.cpp      | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
index d7e57c8590..2d61935e9f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
@@ -1,3 +1,38 @@
+/**
+ *
+ * BSD 3-Clause License
+ *
+ * Copyright (c) 2023 xxxx
+ * All rights reserved.
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ============================================================================
+ *
+ */
 #include <torch/torch.h>
 #include <vector>
 
-- 
Gitee


From 6906963881a778e03a8b25bb04968214f715ec88 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Tue, 13 Jun 2023 14:34:53 +0800
Subject: [PATCH 11/12] change

---
 .../README.md                                      | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
index 9a91f1509c..20ef45804f 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
@@ -10,7 +10,7 @@
 
 ## 简述
 Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上层的封装，
-让用户可以设计自己的模型来完成不同的NLP任务，如：机器翻译、文本摘要、语言模型等等
+让用户可以设计自己的模型来完成不同的NLP任务，如：机器翻译、文本摘要、语言模型等等，
 并提供了主流的模型架构，例如Bert, Transformer的实现。
 
 本仓库为Transformer模型的文本翻译任务。
@@ -47,9 +47,9 @@ Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上
   
 - 安装依赖。
  
-在模型源码包根目录下执行命令，安装Fairseq工具库。
+  在模型源码包根目录下执行命令，安装Fairseq工具库。
   ```
-  pip install --editable ./
+    pip install --editable ./
   ```
 
 ## 准备数据集
@@ -57,7 +57,7 @@ Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上
 1. 获取数据集。
 
    用户自行获取原始数据集WMT18。将数据集上传到服务器并按照需求进行数据处理。
-   数据处理部分可以参考原仓库examples/translation下的数据处理脚本
+   数据处理部分可以参考原仓库examples/translation下的数据处理脚本。
 
    以处理好的WMT18数据集为例，数据集目录结构参考如下所示。
 
@@ -101,7 +101,9 @@ Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上
      bash ./test/train_full_8p.sh <path-to-dataset> <path-to-checkpoint-save-dir>
      ```
 
-   path-to-dataset表示imagenet的数据集路径，需填写到数据集的一级目录。
+   path-to-dataset表示WMT18的数据集路径，需填写到数据集的二级目录（例如：`./data-bin/wmt18_en_de`）。
+   
+   path-to-checkpoint-save-dir表示生成的模型保存路径，可由用户自行配置。
 
    模型训练脚本参数说明如下。
 
@@ -123,7 +125,7 @@ Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上
    --save-dir                          //模型文件保存路径
    ```
    
-   训练完成后，权重文件保存在当前路径下，并输出模型训练精度和性能信息。
+   训练完成后，权重文件保存在用户指定的路径下，并输出模型训练精度和性能信息。
 
 # 训练结果展示
 
-- 
Gitee


From 15419ba66602940f0124765611978dab7d346fb6 Mon Sep 17 00:00:00 2001
From: uniteone <lantianxiang@huawei.com>
Date: Tue, 13 Jun 2023 14:37:30 +0800
Subject: [PATCH 12/12] change

---
 .../dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
index 20ef45804f..3ae0e2c31e 100644
--- a/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
+++ b/PyTorch/dev/perf/FairseqTransformer_iflytek_for_Pytorch/README.md
@@ -133,8 +133,8 @@ Fairseq是一个由Facebook开源的一个NLP的工具库，它是对Pytorch上
 
 | NAME    | BLEU  | WPS  | Epochs | Data_Type | Torch_version |
 | :-----: |:-----:|:----:|:------:|:---------:|:-------------:|
-| 8p-竞品V | 41.12 | 334K |   20   |   FP32    |     1.11      |
-| 8p-NPU  | 41.21 | 223K |   20   |    FP32     |     1.11      |
+| 8p-竞品V | 41.12 | 325K |   20   |   FP32    |     1.11      |
+| 8p-NPU  | 41.21 | 218K |   20   |    FP32     |     1.11      |
 
 # 版本说明
 
-- 
Gitee